This script automatically collects every .bib file in the current directory, parses the paper information in them, and writes structured Markdown files organized by conference name and year (conference/year/number-title.md under the output folder, standard-papers by default). It replaces the ASCII colon ":" in titles with the full-width colon "：", and maps common long conference names to short aliases (e.g., ASPLOS, CC, PLDI) to avoid creating overly long folder names. By default it also translates each paper's title and abstract (disable with --no-translate) and shows a single tqdm progress bar for the whole run. There is no need to specify .bib files by hand: just run python parse_bib.py in the directory that contains your .bib files, and parsing and saving happen in one step! 🚀
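For instance, assuming a hypothetical paper titled "An Example: A Case Study" published at ASPLOS in 2024 that happens to be the third entry processed, the default output layout would look like this (title and numbering are illustrative only):

standard-papers/
└── ASPLOS/
    └── 2024/
        └── 03-An Example：A Case Study.md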

The reference code is as follows:

import argparse  
import asyncio  
import os  
import re  
import glob  
from googletrans import Translator  
from tqdm import tqdm  
  
# Conference name alias table, to avoid creating overly long folder names  
CONFERENCE_NAME_ALIASES = {  
    "ACM-International-Conference-on-Architectural-Support-for-Programming-Languages-and-Operating-Systems": "ASPLOS",  
    "Proceedings-of-the-30th-ACM-International-Conference-on-Architectural-Support-for-Programming-Languages-and-Operating-Systems": "ASPLOS",  
    "ACM-International-Conference-on-Compiler-Construction": "CC",  
    "ACM-International-Conference-on-Programming-Language-Design-and-Implementation": "PLDI",  
    # Extend as needed...  
}  
  
def simplify_conference_name(conference_name: str) -> str:  
    """如果 conference_name 太长,则用映射表简化"""  
    lowered = conference_name.lower()  
    for pattern, alias in CONFERENCE_NAME_ALIASES.items():  
        if pattern.lower() in lowered:  
            return alias  
    return conference_name  
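# Illustrative example (hypothetical call): a long ASPLOS booktitle collapses to its alias.  
# >>> simplify_conference_name(  
# ...     "ACM-International-Conference-on-Architectural-Support-for-"  
# ...     "Programming-Languages-and-Operating-Systems")  
# 'ASPLOS'  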
  
def parse_bibtex_entry(entry: str) -> dict:  
    """解析单个 BibTeX 条目,返回字段字典"""  
    fields = {}  
    matches = re.findall(r"(\w+)\s*=\s*\{([^}]*)\},?", entry, re.DOTALL)  
    for key, value in matches:  
        fields[key.lower()] = value.strip()  
    return fields  
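# Illustrative example (hypothetical entry): given the text  
#   @inproceedings{smith2024example,  
#     title = {An Example Paper},  
#     year = {2024},  
#   }  
# parse_bibtex_entry returns {'title': 'An Example Paper', 'year': '2024'}.  
# Note: the simple regex above does not handle nested braces inside field values.  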
  
def parse_bibtex_file(bib_file: str) -> list:  
    """解析整个 .bib 文件,返回所有条目列表"""  
    with open(bib_file, "r", encoding="utf-8") as f:  
        content = f.read()  
  
    entries = re.split(r"@inproceedings{", content)[1:]  # split on `@inproceedings` (other entry types are ignored)  
    bib_entries = ["@inproceedings{" + e for e in entries]  
    return [parse_bibtex_entry(entry) for entry in bib_entries]  
  
translator = Translator()  # googletrans Translator; awaiting translate() below assumes an async-capable googletrans release  
  
async def translate_text(text: str, dest_lang="zh-cn") -> str:  
    """异步翻译文本"""  
    try:  
        translation = await translator.translate(text, dest=dest_lang, src='en')  
        return translation.text  
    except Exception:  
        return "[Translation failed]"  
  
async def generate_markdown(entry: dict, output_folder: str, paper_number: int, translate: bool = True):  
    """为每条 BibTeX 记录生成 Markdown 文件"""  
    title = entry.get("title", "Untitled").replace(":", ":")  # 替换英文 `:` 为中文 `:`  
    authors = entry.get("author", "Unknown Authors")  
    year = entry.get("year", "Unknown Year")  
    abstract = entry.get("abstract", "No abstract available.")  
    doi = entry.get("doi", "")  
    url = entry.get("url", "")  
    keywords = entry.get("keywords", "No keywords")  
    booktitle = entry.get("booktitle", "Unknown Conference")  
  
    # Normalize the conference name and simplify it via the alias table  
    conference_name = simplify_conference_name(re.sub(r"[^a-zA-Z0-9]", "-", booktitle.strip()))  
  
    # Create folders: output_folder / conference / year  
    year_folder = os.path.join(output_folder, conference_name, str(year))  
    os.makedirs(year_folder, exist_ok=True)  
  
    # Optionally translate the title and abstract  
    translated_title = await translate_text(title) if translate else "[Skipping translation]"  
    translated_abstract = await translate_text(abstract) if translate else "[Skipping translation]"  
  
    # Replace characters that are illegal in filenames  
    clean_title = re.sub(r'[\/:*?"<>|]', '-', title)  
    filename = f"{paper_number:02d}-{clean_title}.md"  
    filepath = os.path.join(year_folder, filename)  
  
    markdown_content = f"""# {title}  **中文标题:** {translated_title}  
  
**作者:** {authors}  **年份:** {year}  **会议/期刊名:** {booktitle}  **关键词:** {keywords}    
---  
  
## Abstract  {abstract}  
  
**中文摘要:**  {translated_abstract}  
  
---  
  
**DOI:** [{doi}]({url})  """  
    with open(filepath, "w", encoding="utf-8") as f:  
        f.write(markdown_content)  
  
async def process_bibtex(output_folder: str = "standard-papers", translate: bool = True):  
    """自动收集当前目录下所有 `.bib` 文件,并解析处理"""  
    os.makedirs(output_folder, exist_ok=True)  
  
    # 1️⃣ Collect all .bib files automatically  
    bib_files = glob.glob("*.bib")  # every .bib file in the current directory  
    if not bib_files:  
        print("❌ No .bib files found in the current directory.")  
        return  
  
    # 2️⃣ Collect all BibTeX entries  
    all_entries = []  
    for bib_file in bib_files:  
        all_entries.extend(parse_bibtex_file(bib_file))  
  
    # 3️⃣ Process all entries with a single progress bar  
    with tqdm(total=len(all_entries), desc="Processing papers", unit="paper") as pbar:  
        for index, entry in enumerate(all_entries, start=1):  
            await generate_markdown(entry, output_folder, index, translate)  
            pbar.update(1)  
  
def main():  
    parser = argparse.ArgumentParser(  
        description="Parse all BibTeX files in the current directory, generate MD files, optionally translate."  
    )  
    parser.add_argument(  
        "--output",  
        default="standard-papers",  
        help="Output folder (default: standard-papers)."  
    )  
    parser.add_argument(  
        "--no-translate",  
        action="store_true",  
        help="Disable translating title and abstract."  
    )  
  
    args = parser.parse_args()  
    translate = not args.no_translate  
  
    # Run the async pipeline  
    asyncio.run(process_bibtex(args.output, translate))  
  
if __name__ == "__main__":  
    main()

Usage: place one or more .bib files in the same directory as the script and run it; the script automatically collects the information from your .bib files and parses it.
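For example (the flags below come from the argparse definitions in the script; the googletrans and tqdm packages must be installed):

python parse_bib.py                      # parse all .bib files and translate titles/abstracts (default)
python parse_bib.py --no-translate       # skip translation
python parse_bib.py --output my-papers   # write the Markdown files to a custom output folder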