SLTrans
简易教程
git lfs install # 确保你安装了git-lfs
git clone https://hf-mirror.com/datasets/UKPLab/SLTrans
cd SLTrans
git lfs pull
git checkout main
然后把下面这个脚本放到目录里去:
import os
import pandas as pd
def list_parquet_files(directory):
    """Recursively collect every Parquet file under *directory*.

    Prints the directory being scanned, the number of matches, and each
    matched path, then returns the list of paths.
    """
    print(f"正在搜索目录: {directory}")
    parquet_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.endswith(".parquet")
    ]
    print(f"找到 {len(parquet_files)} 个 Parquet 文件:")
    for file in parquet_files:
        print(f" {file}")
    return parquet_files
def create_output_directory(output_path):
    """Create *output_path* (including parents) if it does not already exist.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of an
    exists-then-create pair: the original check-then-act sequence is a
    TOCTOU race (another process could create the directory between the
    ``exists`` test and the ``makedirs`` call, raising FileExistsError).
    """
    os.makedirs(output_path, exist_ok=True)
def write_source_file(source_code, output_path, file_name):
    """Write *source_code* to ``<output_path>/<file_name>`` as UTF-8 text."""
    destination = os.path.join(output_path, file_name)
    with open(destination, "w", encoding="utf-8") as out:
        out.write(source_code)
def process_parquet_file(file_path, output_base_path, dataset_path, language_config):
    """Extract the ``Source_Code`` column of one Parquet file to disk.

    Each record becomes ``<output_base_path>/<language>/<parquet stem>/source_<i><ext>``,
    where ``<language>`` is the first path component of *file_path* relative to
    *dataset_path* and ``<ext>`` comes from *language_config*. Files whose
    language folder is not in *language_config* are skipped with a warning.
    Errors are logged and swallowed so one bad file does not abort the batch.
    """
    try:
        df = pd.read_parquet(file_path)
        print(f"处理文件: {file_path}")
        print(f"发现 {len(df)} 条记录.")
        # The language is the top-level folder name under the dataset root.
        relative_path = os.path.relpath(file_path, dataset_path)
        language = relative_path.split(os.sep)[0]
        if language not in language_config:
            print(f"警告: 未在配置中找到语言 {language} 的设置,跳过文件 {file_path}")
            return
        # splitext keeps interior dots in the stem ("train.part1.parquet" ->
        # "train.part1"); the original split('.')[0] truncated such names,
        # making distinct shards collide in one output folder.
        parquet_basename = os.path.splitext(os.path.basename(file_path))[0]
        output_path = os.path.join(output_base_path, language, parquet_basename)
        create_output_directory(output_path)
        # The extension is per-language, so look it up once, not per row.
        file_extension = language_config[language].get("file_extension", ".txt")
        for idx, row in df.iterrows():
            file_name = f"source_{idx + 1}{file_extension}"  # numbering starts at 1
            write_source_file(row['Source_Code'], output_path, file_name)
        print(f"处理完成: {file_path},输出到目录 {output_path}")
    except Exception as e:
        # Best-effort batch job: report the failure and move on.
        print(f"处理文件 {file_path} 时出错: {e}")
def main():
    """Walk the dataset root, find every Parquet file, and dump its sources."""
    dataset_path = "./"  # replace with the actual dataset root if needed
    output_base_path = os.path.join(dataset_path, "output")
    # Maps each language folder name to the extension its source files get.
    language_config = {
        "C": {"file_extension": ".c"},
        "C++": {"file_extension": ".cpp"},
        "D": {"file_extension": ".d"},
        "Fortran": {"file_extension": ".f90"},
        "Go": {"file_extension": ".go"},
        "Haskell": {"file_extension": ".hs"},
        "Nim": {"file_extension": ".nim"},
        "Objective-C": {"file_extension": ".m"},
        "Python": {"file_extension": ".py"},
        "Rust": {"file_extension": ".rs"},
        "Swift": {"file_extension": ".swift"},
    }
    print(f"数据集主路径: {dataset_path}")
    print(f"输出路径: {output_base_path}")
    parquet_files = list_parquet_files(dataset_path)
    # Guard clause: bail out early when nothing was found.
    if not parquet_files:
        print("未找到任何 Parquet 文件,请检查路径是否正确。")
        return
    print(f"找到 {len(parquet_files)} 个 Parquet 文件.")
    for file_path in parquet_files:
        process_parquet_file(file_path, output_base_path, dataset_path, language_config)
    print("所有文件已处理完成!")


if __name__ == "__main__":
    main()
然后使用python运行就行,结果在output文件夹中,验证方法:
❯ du -h --max-depth=1
222M ./Haskell
74M ./D
2.3M ./Objective-C
1.7G ./Rust
2.7G ./C
1.1M ./Nim
34M ./Go
54M ./Fortran
16M ./Swift
1.2G ./Python
24G ./C++
30G .
❯ find . -type f | wc -l
6978538
文件数量对上就行。
评论