自动批量解析bib工具
这个脚本会自动收集当前目录下所有 .bib 文件,解析其中的论文信息,并根据会议名称和年份生成结构化的 Markdown 文件(`会议名/年份/编号-论文标题.md`)。它会自动把标题中的英文冒号 `:` 替换为中文冒号 `:`,并映射常见会议名称(如 ASPLOS、CC、PLDI)以避免创建过长的文件夹名。默认情况下,它会翻译论文标题和摘要(可使用 `--no-translate` 关闭),并使用单一 tqdm 进度条直观显示处理进度。整个流程无需手动指定 .bib 文件,只需在 .bib 文件所在目录运行 `python parse_bib.py`,即可一键完成解析和存储! 🚀
参考如下代码:
import argparse
import asyncio
import os
import re
import glob
from googletrans import Translator
from tqdm import tqdm
# Conference-name alias table: maps long, hyphen-separated conference names to
# short aliases so that overly long folder names are not created.
CONFERENCE_NAME_ALIASES = {
    "ACM-International-Conference-on-Architectural-Support-for-Programming-Languages-and-Operating-Systems": "ASPLOS",
    "Proceedings-of-the-30th-ACM-International-Conference-on-Architectural-Support-for-Programming-Languages-and-Operating-Systems": "ASPLOS",
    "ACM-International-Conference-on-Compiler-Construction": "CC",
    "ACM-International-Conference-on-Programming-Language-Design-and-Implementation": "PLDI",
    # Extend with further mappings as needed...
}
def simplify_conference_name(conference_name: str) -> str:
    """Return the short alias of a known conference, else the name unchanged.

    A pattern from ``CONFERENCE_NAME_ALIASES`` matches when it occurs anywhere
    inside ``conference_name``; the comparison ignores case. The first
    matching pattern, in the table's iteration order, decides the alias.
    """
    haystack = conference_name.lower()
    matching_aliases = (
        short_name
        for long_name, short_name in CONFERENCE_NAME_ALIASES.items()
        if long_name.lower() in haystack
    )
    # Fall back to the input itself when no pattern matched.
    return next(matching_aliases, conference_name)
# Start of a field: ``name =`` followed by optional whitespace.
_BIBTEX_FIELD_START = re.compile(r"(\w+)\s*=\s*")
# An undelimited value such as ``2024`` or ``jan``.
_BIBTEX_BARE_VALUE = re.compile(r"[^\s,{}]+")


def _read_bibtex_value(text: str, start: int):
    """Read the field value that begins at ``text[start]``.

    Returns ``(value, next_pos)``. ``value`` is ``None`` when no complete
    value could be read (nothing usable at ``start``, or the closing
    delimiter is missing); ``next_pos`` is where scanning should resume.
    """
    opener = text[start]
    if opener not in '{"':
        bare = _BIBTEX_BARE_VALUE.match(text, start)
        if bare is None:
            return None, start
        return bare.group(0), bare.end()

    depth = 0
    pos = start
    while pos < len(text):
        char = text[pos]
        if char == "\\":
            # Skip the escaped character so that \{ and \} do not affect
            # the brace depth.
            pos += 2
            continue
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if opener == "{" and depth == 0:
                return text[start + 1:pos], pos + 1
        elif char == '"' and opener == '"' and depth <= 0 and pos > start:
            # A quote closes the value only outside of nested braces.
            return text[start + 1:pos], pos + 1
        pos += 1
    # Unterminated value: drop it and stop scanning.
    return None, len(text)


def parse_bibtex_entry(entry: str) -> dict:
    """Parse one BibTeX entry and return its fields as a dictionary.

    Keys are the field names in lower case; values are the field contents
    with the outer delimiters removed and surrounding whitespace stripped.
    Text inside the value, including nested braces, is kept verbatim. If a
    field name occurs more than once, the last occurrence wins.

    Three value forms are recognised: brace-delimited (``{...}``, with
    balanced nested braces), quote-delimited (``"..."``) and bare tokens
    such as ``2024``. Because the scanner jumps over each value once it has
    been read, text like ``x = {y}`` inside an abstract is not mistaken for
    a field of its own.

    Args:
        entry: Text of a single entry, e.g. ``@inproceedings{key, ...}``.

    Returns:
        Mapping of lower-cased field name to field value; empty when no
        field is found.
    """
    fields = {}
    pos = 0
    while pos < len(entry):
        match = _BIBTEX_FIELD_START.search(entry, pos)
        if match is None or match.end() >= len(entry):
            break
        value, pos = _read_bibtex_value(entry, match.end())
        if value is not None:
            fields[match.group(1).lower()] = value.strip()
    return fields
def parse_bibtex_file(bib_file: str) -> list:
    """Parse a ``.bib`` file and return one field dictionary per paper.

    Only conference-paper entries are returned: ``@inproceedings`` and its
    standard BibTeX alias ``@conference``. The entry type is matched
    case-insensitively, so ``@InProceedings{`` is accepted as well. Entries
    of any other type are skipped.

    Each entry's text runs from its own ``@type{`` header up to the header
    of the next entry of any type (or the end of the file), which keeps the
    fields of a following ``@article`` or similar entry from leaking into
    the preceding paper.

    Args:
        bib_file: Path of the ``.bib`` file; it is read as UTF-8.

    Returns:
        List of dictionaries as produced by ``parse_bibtex_entry``, in file
        order.
    """
    with open(bib_file, "r", encoding="utf-8") as f:
        content = f.read()

    wanted_types = {"inproceedings", "conference"}
    headers = list(re.finditer(r"@(\w+)\s*\{", content))

    entries = []
    for index, header in enumerate(headers):
        if header.group(1).lower() not in wanted_types:
            continue
        if index + 1 < len(headers):
            end = headers[index + 1].start()
        else:
            end = len(content)
        entries.append(parse_bibtex_entry(content[header.start():end]))
    return entries
# Module-level googletrans client; translate_text() sends its requests through it.
translator = Translator()
async def translate_text(text: str, dest_lang="zh-cn") -> str:
    """Translate English ``text`` into ``dest_lang`` on a best-effort basis.

    Args:
        text: English source text.
        dest_lang: Target language code passed to the translator as ``dest``.

    Returns:
        The translated text. Empty or whitespace-only input is returned
        unchanged without calling the translator. If the translation raises,
        the reason is printed and the placeholder ``"[Translation failed]"``
        is returned, so a single failure does not abort the whole run.
    """
    if not text.strip():
        # Nothing to translate.
        return text
    try:
        translation = await translator.translate(text, dest=dest_lang, src='en')
        return translation.text
    except Exception as exc:
        # tqdm.write prints the message without breaking an active progress bar.
        tqdm.write(f"⚠️ Translation failed: {exc!r}")
        return "[Translation failed]"
async def generate_markdown(entry: dict, output_folder: str, paper_number: int, translate: bool = True):
    """Write one Markdown file for a single BibTeX record.

    The file is stored as
    ``<output_folder>/<conference>/<year>/<NN>-<title>.md``; missing folders
    are created.

    Args:
        entry: Field dictionary of one paper. The keys ``title``, ``author``,
            ``year``, ``abstract``, ``doi``, ``url``, ``keywords`` and
            ``booktitle`` are read; each has a placeholder default.
        output_folder: Root folder of the generated tree.
        paper_number: Number used as the zero-padded file name prefix.
        translate: When true, title and abstract are translated with
            ``translate_text``; otherwise ``"[Skipping translation]"`` is
            written in their place.
    """
    # Collapse line breaks and runs of whitespace so that a title wrapped
    # over several lines in the .bib file stays a single heading / file name.
    title = " ".join(entry.get("title", "Untitled").split())
    # Replace the ASCII colon with the full-width Chinese colon (U+FF1A).
    title = title.replace(":", "\uff1a")
    authors = entry.get("author", "Unknown Authors")
    year = entry.get("year", "Unknown Year")
    abstract = entry.get("abstract", "No abstract available.")
    doi = entry.get("doi", "")
    url = entry.get("url", "")
    keywords = entry.get("keywords", "No keywords")
    booktitle = entry.get("booktitle", "Unknown Conference")

    # Turn the conference name into a folder-safe string, then shorten it
    # through the alias table.
    conference_name = simplify_conference_name(re.sub(r"[^a-zA-Z0-9]", "-", booktitle.strip()))

    # Create the folder: <output_folder> / conference / year
    year_folder = os.path.join(output_folder, conference_name, str(year))
    os.makedirs(year_folder, exist_ok=True)

    # Translate title and abstract (optional).
    if translate:
        translated_title = await translate_text(title)
        translated_abstract = await translate_text(abstract)
    else:
        translated_title = translated_abstract = "[Skipping translation]"

    # Replace characters that are not allowed in file names; the backslash
    # is included because it is a path separator on Windows.
    clean_title = re.sub(r'[\\/:*?"<>|]', '-', title)
    filename = f"{paper_number:02d}-{clean_title}.md"
    filepath = os.path.join(year_folder, filename)

    # Link to the entry's URL; fall back to the DOI resolver when only a DOI
    # is known, and avoid an empty "[]()" link when neither is present.
    link_target = url or (f"https://doi.org/{doi}" if doi else "")
    if link_target:
        doi_line = f"**DOI:** [{doi or link_target}]({link_target})"
    else:
        doi_line = "**DOI:** N/A"

    # Blank lines separate every element: headings must stand on their own
    # line, and a "---" directly below a text line would be read as a setext
    # heading underline instead of a horizontal rule.
    markdown_content = "\n".join([
        f"# {title}",
        "",
        f"**中文标题:** {translated_title}",
        "",
        f"**作者:** {authors}",
        "",
        f"**年份:** {year}",
        "",
        f"**会议/期刊名:** {booktitle}",
        "",
        f"**关键词:** {keywords}",
        "",
        "---",
        "",
        "## Abstract",
        "",
        abstract,
        "",
        f"**中文摘要:** {translated_abstract}",
        "",
        "---",
        "",
        doi_line,
        "",
    ])

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(markdown_content)
async def process_bibtex(output_folder: str = "standard-papers", translate: bool = True):
    """Collect every ``.bib`` file in the current directory and convert it.

    All entries of all files are gathered first and then written one by one
    with ``generate_markdown``, numbered from 1 across the whole run, while a
    single progress bar tracks the work.

    Args:
        output_folder: Root folder for the generated Markdown tree; created
            if it does not exist.
        translate: Forwarded to ``generate_markdown``.
    """
    os.makedirs(output_folder, exist_ok=True)

    # 1) Collect all .bib files of the current directory. Sorting makes the
    #    processing order, and therefore the paper numbering, reproducible
    #    instead of depending on the order the file system lists them in.
    bib_files = sorted(glob.glob("*.bib"))
    if not bib_files:
        print("❌ No .bib files found in the current directory.")
        return

    # 2) Gather the entries of every file.
    all_entries = []
    for bib_file in bib_files:
        all_entries.extend(parse_bibtex_file(bib_file))

    # 3) Process all entries under one progress bar.
    with tqdm(total=len(all_entries), desc="Processing .bib files", unit="paper") as pbar:
        for index, entry in enumerate(all_entries, start=1):
            await generate_markdown(entry, output_folder, index, translate)
            pbar.update(1)
def main():
    """Command-line entry point: read the options and run the conversion.

    Options:
        --output        Root folder for the generated files.
        --no-translate  Switch off translation of title and abstract.
    """
    cli = argparse.ArgumentParser(
        description="Parse all BibTeX files in the current directory, generate MD files, optionally translate."
    )
    cli.add_argument("--output", default="standard-papers", help="Output folder (default: standard-papers).")
    cli.add_argument("--no-translate", action="store_true", help="Disable translating title and abstract.")
    options = cli.parse_args()

    # Translation stays on unless --no-translate was given.
    conversion = process_bibtex(options.output, not options.no_translate)
    # Drive the asynchronous pipeline to completion.
    asyncio.run(conversion)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
使用方法:把一个或多个 .bib 文件放在同一个目录下,在该目录中运行 `python parse_bib.py`(脚本只会查找运行时当前目录下的 .bib 文件),脚本就会自动收集这些 .bib 文件中的条目并逐一解析。
评论