本文件用于标准化 LLVM IR(Intermediate Representation)中的函数、模块、循环和 SCC(Strongly Connected Components),并将处理后的 IR 以标准格式写入指定目录。
它可以用于优化前后的 IR 结构对比,并通过清除基本块和指令的名称(代码本身并不删除调试元数据)来提升 IR 的可读性和一致性。
目的是制作数据集。
涵盖的功能有:

  1. 标准化 IR 结构
    • 清除 BasicBlock 和 Instruction 的名称,确保一致性。
    • 处理 Function、Module、Loop、LazyCallGraph::SCC 等不同 IR 单元。
  2. 文件存储
    • IR 处理后,存储到指定目录,按照 模块名-哈希值-时间戳-后缀.ll 进行命名,方便后续分析。
    • 利用 llvm::sys::fs::create_directories 自动创建存储目录,避免手动管理。
  3. 哈希计算
    • 对函数名计算 6 位十六进制哈希(即 24 位),避免冗长的文件名;该哈希只能降低文件名冲突的概率,并不能保证唯一性。
  4. 模块名优化
    • 去掉 LLVM 模块的完整路径,仅保留文件名,避免文件路径污染模块命名。
      #include "llvm/Analysis/LazyCallGraph.h"
      #include "llvm/Analysis/LoopInfo.h"
      #include "llvm/CodeGen/MachineFunction.h"
      #include "llvm/IR/BasicBlock.h"
      #include "llvm/IR/Function.h"
      #include "llvm/IR/Instruction.h"
      #include "llvm/IR/Module.h"
      #include "llvm/Support/FileSystem.h"
      #include "llvm/Support/Path.h"  // 关键修改:引入 LLVM 的路径处理库
      #include "llvm/Support/raw_ostream.h"
      #include <chrono>
      #include <ctime>
      #include <functional>
      #include <iomanip>
      #include <sstream>
      
      using namespace llvm;
      
      /// Clears the name of every basic block in \p F and of every instruction
      /// contained in those blocks.
      static void standardizeFunction(Function &F) {
        for (auto &Block : F) {
          // Block and instruction names are independent, so the order of the two
          // resets does not matter.
          for (auto &Inst : Block)
            Inst.setName("");
          Block.setName("");
        }
      }
      
      /// Applies standardizeFunction to each function held by \p M.
      static void standardizeModule(Module &M) {
        for (auto FnIt = M.begin(), FnEnd = M.end(); FnIt != FnEnd; ++FnIt)
          standardizeFunction(*FnIt);
      }
      
      /// Clears the names of the basic blocks reported by \p L.blocks() and of the
      /// instructions inside them. Blocks outside that range are left untouched.
      static void standardizeLoop(Loop &L) {
        for (BasicBlock *Block : L.blocks()) {
          for (auto &Inst : *Block)
            Inst.setName("");
          Block->setName("");
        }
      }
      
      /// Applies standardizeFunction to the function of every node in \p S.
      static void standardizeSCC(LazyCallGraph::SCC &S) {
        for (auto &Node : S) {
          Function &Fn = Node.getFunction();
          standardizeFunction(Fn);
        }
      }
      
      /// Returns the low 24 bits of std::hash<std::string> applied to \p Name,
      /// formatted as lowercase hexadecimal and zero-padded on the left to six
      /// characters.
      static std::string get6HexHash(const std::string &Name) {
        const std::size_t Low24 = std::hash<std::string>{}(Name) & 0xFFFFFF;
        std::ostringstream Out;
        Out << std::setfill('0') << std::setw(6) << std::hex << Low24;
        return Out.str();
      }
      
      template <typename IRUnitT>
      void standardizeAndWriteIR(IRUnitT &IR, const std::string &Suffix, const std::string &ShortTimestamp,
                                 const std::string &OutputBaseDir = "/home/yz/clean/DBGMASTER/IR_DUMP_DIR/") {
        std::string ModuleName = "UnknownModule";
        std::string FuncHash = "000000";
      
        // 获取模块名称
        if constexpr (std::is_same_v<IRUnitT, Module>) {
          ModuleName = IR.getName().str();
          if (ModuleName.empty())
            ModuleName = "UnknownModule";
          else
            ModuleName = sys::path::filename(ModuleName).str(); // 仅获取文件名
          standardizeModule(IR);
        } else if constexpr (std::is_same_v<IRUnitT, Function>) {
          Module *M = IR.getParent();
          if (M && !M->getName().empty())
            ModuleName = sys::path::filename(M->getName()).str();
          std::string FuncName = IR.getName().str();
          if (FuncName.empty())
            FuncName = "UnknownFunc";
          FuncHash = get6HexHash(FuncName);
          standardizeFunction(IR);
        } else if constexpr (std::is_same_v<IRUnitT, Loop>) {
          Function *F = IR.getHeader() ? IR.getHeader()->getParent() : nullptr;
          if (F) {
            Module *M = F->getParent();
            if (M && !M->getName().empty())
              ModuleName = sys::path::filename(M->getName()).str();
            std::string FuncName = F->getName().str();
            if (!FuncName.empty())
              FuncHash = get6HexHash(FuncName);
          }
          standardizeLoop(IR);
        } else if constexpr (std::is_same_v<IRUnitT, LazyCallGraph::SCC>) {
          ModuleName = "SCC";
          standardizeSCC(IR);
        } else {
          errs() << "Unknown IR type in standardizeAndWriteIR, skipping...\n";
          return;
        }
      
        // 计算最终输出目录(IR_DUMP_DIR/module_name/)
        std::string ModuleOutputDir = OutputBaseDir + ModuleName + "/";
        if (std::error_code EC = sys::fs::create_directories(ModuleOutputDir)) {
          errs() << "Error: Failed to create directory: " << ModuleOutputDir << " (" << EC.message() << ")\n";
          return;
        }
      
        // 计算输出文件名
        std::string OutputFilename = ModuleOutputDir + ModuleName + "-" + FuncHash + "-" + ShortTimestamp + "-" + Suffix + ".ll";
        
        // 打开文件进行写入
        std::error_code EC;
        raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OF_None);
        if (!EC) {
          if constexpr (std::is_same_v<IRUnitT, Module> || std::is_same_v<IRUnitT, Function>) {
            IR.print(OutFile, nullptr);
          } else {
            errs() << "Unsupported IR type for printing.\n";
          }
        } else {
          errs() << "Failed to write IR to file: " << OutputFilename << " (" << EC.message() << ")\n";
        }
      }

可以直接使用下面这个 patch(目前仅针对 SimplifyCFG):

diff --git a/llvm/include/llvm/Transforms/Utils/MyExtension/StandardIRPrinter.h b/llvm/include/llvm/Transforms/Utils/MyExtension/StandardIRPrinter.h
new file mode 100644
index 000000000000..29de766eac9f
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/MyExtension/StandardIRPrinter.h
@@ -0,0 +1,129 @@
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"  // 关键修改:引入 LLVM 的路径处理库
+#include "llvm/Support/raw_ostream.h"
+#include <chrono>
+#include <ctime>
+#include <functional>
+#include <iomanip>
+#include <sstream>
+
+using namespace llvm;
+
+#define PRIOR_PRINTER(Any, PASS_NAME, TIMESTAMP_VAR) \
+  auto Now = std::chrono::system_clock::now(); \
+  auto TIMESTAMP_VAR = std::chrono::duration_cast<std::chrono::nanoseconds>(Now.time_since_epoch()).count(); \
+  std::string TIMESTAMP_VAR##Str = std::to_string(TIMESTAMP_VAR); \
+  std::string TIMESTAMP_VAR##Short = TIMESTAMP_VAR##Str.substr(TIMESTAMP_VAR##Str.length() - 8); \
+  LLVM_DEBUG(dbgs() << "TIMENOW-" << TIMESTAMP_VAR##Short << "\n"); \
+  standardizeAndWriteIR(Any, PASS_NAME "-before", TIMESTAMP_VAR##Short);
+
+#define POST_PRINTER(Any, PASS_NAME, TIMESTAMP_VAR) \
+  standardizeAndWriteIR(Any, PASS_NAME "-after", TIMESTAMP_VAR##Short);
+
+static void standardizeFunction(Function &F) {
+  for (BasicBlock &BB : F) {
+    BB.setName("");
+    for (Instruction &I : BB) {
+      I.setName("");
+    }
+  }
+}
+
+static void standardizeModule(Module &M) {
+  for (Function &F : M)
+    standardizeFunction(F);
+}
+
+static void standardizeLoop(Loop &L) {
+  for (auto *BB : L.blocks()) {
+    BB->setName("");
+    for (Instruction &I : *BB)
+      I.setName("");
+  }
+}
+
+static void standardizeSCC(LazyCallGraph::SCC &S) {
+  for (auto &N : S)
+    standardizeFunction(N.getFunction());
+}
+
+static std::string get6HexHash(const std::string &Name) {
+  std::hash<std::string> H;
+  std::size_t V = H(Name);
+  std::stringstream S;
+  S << std::hex << std::setw(6) << std::setfill('0') << (V & 0xFFFFFF);
+  return S.str();
+}
+
+template <typename IRUnitT>
+void standardizeAndWriteIR(IRUnitT &IR, const std::string &Suffix, const std::string &ShortTimestamp,
+                           const std::string &OutputBaseDir = "/home/yz/clean/DBGMASTER/IR_DUMP_DIR/") {
+  std::string ModuleName = "UnknownModule";
+  std::string FuncHash = "000000";
+
+  // 获取模块名称
+  if constexpr (std::is_same_v<IRUnitT, Module>) {
+    ModuleName = IR.getName().str();
+    if (ModuleName.empty())
+      ModuleName = "UnknownModule";
+    else
+      ModuleName = sys::path::filename(ModuleName).str(); // 仅获取文件名
+    standardizeModule(IR);
+  } else if constexpr (std::is_same_v<IRUnitT, Function>) {
+    Module *M = IR.getParent();
+    if (M && !M->getName().empty())
+      ModuleName = sys::path::filename(M->getName()).str();
+    std::string FuncName = IR.getName().str();
+    if (FuncName.empty())
+      FuncName = "UnknownFunc";
+    FuncHash = get6HexHash(FuncName);
+    standardizeFunction(IR);
+  } else if constexpr (std::is_same_v<IRUnitT, Loop>) {
+    Function *F = IR.getHeader() ? IR.getHeader()->getParent() : nullptr;
+    if (F) {
+      Module *M = F->getParent();
+      if (M && !M->getName().empty())
+        ModuleName = sys::path::filename(M->getName()).str();
+      std::string FuncName = F->getName().str();
+      if (!FuncName.empty())
+        FuncHash = get6HexHash(FuncName);
+    }
+    standardizeLoop(IR);
+  } else if constexpr (std::is_same_v<IRUnitT, LazyCallGraph::SCC>) {
+    ModuleName = "SCC";
+    standardizeSCC(IR);
+  } else {
+    errs() << "Unknown IR type in standardizeAndWriteIR, skipping...\n";
+    return;
+  }
+
+  // 计算最终输出目录(IR_DUMP_DIR/module_name/)
+  std::string ModuleOutputDir = OutputBaseDir + ModuleName + "/";
+  if (std::error_code EC = sys::fs::create_directories(ModuleOutputDir)) {
+    errs() << "Error: Failed to create directory: " << ModuleOutputDir << " (" << EC.message() << ")\n";
+    return;
+  }
+
+  // 计算输出文件名
+  std::string OutputFilename = ModuleOutputDir + ModuleName + "-" + FuncHash + "-" + ShortTimestamp + "-" + Suffix + ".ll";
+  
+  // 打开文件进行写入
+  std::error_code EC;
+  raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OF_None);
+  if (!EC) {
+    if constexpr (std::is_same_v<IRUnitT, Module> || std::is_same_v<IRUnitT, Function>) {
+      IR.print(OutFile, nullptr);
+    } else {
+      errs() << "Unsupported IR type for printing.\n";
+    }
+  } else {
+    errs() << "Failed to write IR to file: " << OutputFilename << " (" << EC.message() << ")\n";
+  }
+}
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 4e437e9abeb4..348c30934357 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
+#include "llvm/Transforms/Utils/MyExtension/StandardIRPrinter.h"
 #include <utility>
 using namespace llvm;

@@ -373,6 +374,7 @@ void SimplifyCFGPass::printPipeline(

 PreservedAnalyses SimplifyCFGPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
+  PRIOR_PRINTER(F, "simplifycfg", Timestamp);
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   Options.AC = &AM.getResult<AssumptionAnalysis>(F);
   DominatorTree *DT = nullptr;
@@ -383,6 +385,7 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
   PreservedAnalyses PA;
   if (RequireAndPreserveDomTree)
     PA.preserve<DominatorTreeAnalysis>();
+  POST_PRINTER(F, "simplifycfg", Timestamp);
   return PA;
 }