有关指令调度的学术笔记已经有比较多了,这里只列举一些我觉得需要注意的内容,其余的可以直接参考“附录的参考文献”。如果你对下面的概念都熟悉,说明你已经有了初步的基础,反之,你可以去了解前置需要学习的概念。(其中,参考文献一最好)
后续内容主要是对 LLVM 中实际实现的解读,而不是以下这些通用概念。

  • Throughput
  • Latency
  • reservation station
  • 静态调度 vs 动态调度
  • IssueWidth
  • 数据型冒险
  • 结构型冒险
  • 控制型冒险
  • 流水线 stall
  • RAW, WAR, WAW
  • 指令调度与寄存器分配

image-20250310192203331.webp

①SelectionDAGISel

📌 阶段: 指令选择(Instruction Selection)后,在 DAG(Directed Acyclic Graph)上进行调度
📌 主要 Pass: SelectionDAGISel
📌 优化目标: 决定指令的发射顺序,减少流水线停顿,提高吞吐量
可选算法:

调度算法 说明
list-burr 最老的 List Scheduling,基于 bottom-up register reduction
list-hybrid 列表调度,平衡延迟和寄存器压力
list-ilp 列表调度,平衡 ILP 和寄存器压力
linearize 按 LLVM IR 顺序执行(不调度)
fast 启发式方法,快速调度,牺牲部分优化
default 目标平台决定,也是默认的

②MachineScheduler

enable-misched 和 enable-post-misched 选项通常默认开启,但是 post-RA-scheduler 默认关闭。

源码解读①preRASched

指令调度的配置位于 SelectionDAGISel.cpp:

/// Creates the pre-RA scheduler for this instruction selector by calling
/// ISHeuristic with this selector and the current optimization level.
/// NOTE(review): ISHeuristic is defined outside this excerpt; presumably it is
/// the scheduler factory chosen via -pre-RA-sched — confirm against
/// SelectionDAGISel.cpp.
ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() {  
  return ISHeuristic(this, OptLevel);  
}

/// Chooses the pre-RA SelectionDAG scheduler when no explicit one is forced.
///
/// Decision order:
///   1. a scheduler constructor supplied by the subtarget, if it returns one;
///   2. the source-order list scheduler when optimization is disabled, when
///      the subtarget reports both enableMachineScheduler() and
///      enableMachineSchedDefaultSched(), or when the target lowering prefers
///      Sched::Source;
///   3. otherwise the scheduler that matches the target lowering's scheduling
///      preference, with Sched::ILP as the only remaining valid value.
///
/// \param IS       the instruction selector requesting a scheduler.
/// \param OptLevel the code generation optimization level.
/// \returns the scheduler produced by the selected factory function.
ScheduleDAGSDNodes *createDefaultScheduler(SelectionDAGISel *IS,
                                           CodeGenOptLevel OptLevel) {
  const TargetSubtargetInfo &Subtarget = IS->MF->getSubtarget();
  const TargetLowering *Lowering = IS->TLI;

  // A target-provided constructor takes precedence over every generic rule.
  if (auto *TargetCtor = Subtarget.getDAGScheduler(OptLevel))
    return TargetCtor(IS, OptLevel);

  // Source order is used when not optimizing, when the subtarget's
  // MachineScheduler hooks are both enabled, or when explicitly preferred.
  if (OptLevel == CodeGenOptLevel::None ||
      (Subtarget.enableMachineScheduler() &&
       Subtarget.enableMachineSchedDefaultSched()) ||
      Lowering->getSchedulingPreference() == Sched::Source)
    return createSourceListDAGScheduler(IS, OptLevel);

  // Dispatch on the remaining preferences; anything not listed falls through
  // to the ILP scheduler below.
  switch (Lowering->getSchedulingPreference()) {
  case Sched::RegPressure:
    return createBURRListDAGScheduler(IS, OptLevel);
  case Sched::Hybrid:
    return createHybridListDAGScheduler(IS, OptLevel);
  case Sched::VLIW:
    return createVLIWDAGScheduler(IS, OptLevel);
  case Sched::Fast:
    return createFastDAGScheduler(IS, OptLevel);
  case Sched::Linearize:
    return createDAGLinearizer(IS, OptLevel);
  default:
    break;
  }

  assert(Lowering->getSchedulingPreference() == Sched::ILP &&
         "Unknown sched type!");
  return createILPListDAGScheduler(IS, OptLevel);
}

如果不加命令行参数选择特定指令调度器,会进入 createSourceListDAGScheduler
/// Builds the list scheduler returned for the "source" scheduling choice.
///
/// Looks up the subtarget's instruction and register info, creates a
/// SrcRegReductionPriorityQueue, creates a ScheduleDAGRRList that uses that
/// queue, and finally links the queue back to the scheduler.
///
/// \param IS       the instruction selector whose MachineFunction is scheduled.
/// \param OptLevel the code generation optimization level, forwarded to the
///                 scheduler.
/// \returns the newly allocated ScheduleDAGRRList.
ScheduleDAGSDNodes *
llvm::createSourceListDAGScheduler(SelectionDAGISel *IS,
                                   CodeGenOptLevel OptLevel) {
  const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
  const TargetInstrInfo *TII = STI.getInstrInfo();
  const TargetRegisterInfo *TRI = STI.getRegisterInfo();

  // NOTE(review): the meaning of the boolean arguments is not visible in this
  // excerpt — check the SrcRegReductionPriorityQueue / ScheduleDAGRRList
  // constructors before changing them.
  SrcRegReductionPriorityQueue *PQ =
    new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr);
  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
  // Key statement: connect the priority queue to the scheduler that owns it.
  PQ->setScheduleDAG(SD);
  return SD;
}

指令调度的真正执行在:
// Create the scheduler, then run it on the current DAG and the basic block
// recorded in FuncInfo.
ScheduleDAGSDNodes *Scheduler = CreateScheduler();  
{  
  // T is declared inside this brace scope, so it lives only around Run().
  NamedRegionTimer T("sched", "Instruction Scheduling", GroupName,  
                     GroupDescription, TimePassesIsEnabled);  
  Scheduler->Run(CurDAG, FuncInfo->MBB);  
}

进一步: ScheduleDAGSDNodes.cpp
/// Run - perform scheduling.
///
/// Records the basic block and DAG to work on, clears the SUnit DAG and the
/// output Sequence from any previous run, then calls Schedule().
///
/// \param dag the SelectionDAG to schedule.
/// \param bb  the machine basic block being scheduled.
void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) {
  BB = bb;
  DAG = dag;

  // Clear the scheduler's SUnit DAG.
  ScheduleDAG::clearDAG();
  Sequence.clear();

  // Invoke the target's selection of scheduler.
  // (Step into this call to follow the concrete scheduler.)
  Schedule();
}

下一步,终于有日志提示了:
/// Schedule - Schedule the DAG using list scheduling.  
void ScheduleDAGRRList::Schedule() {  
  LLVM_DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)  
                    << " '" << BB->getName() << "' **********\n");  
  
  CurCycle = 0;  
  IssueCount = 0;  
  MinAvailableCycle =  
      DisableSchedCycles ? 0 : std::numeric_limits<unsigned>::max();  
  NumLiveRegs = 0;  
  // Allocate slots for each physical register, plus one for a special register  
  // to track the virtual resource of a calling sequence.  LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]());  
  LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]());  
  CallSeqEndForStart.clear();  
  assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences");  
  
  // Build the scheduling graph.  
  BuildSchedGraph(nullptr);  
  
  LLVM_DEBUG(dump());  
  Topo.MarkDirty();  
  
  AvailableQueue->initNodes(SUnits);  
  
  HazardRec->Reset();  
  
  // Execute the actual scheduling loop.  
  ListScheduleBottomUp();  # step into
  
  AvailableQueue->releaseState();  
  
  LLVM_DEBUG({  
    dbgs() << "*** Final schedule ***\n";  
    dumpSchedule();  
    dbgs() << '\n';  
  });}

下面可能会有层层嵌套的函数,处理完才开始真正调度每条指令:
/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up  
/// schedulers.  
void ScheduleDAGRRList::ListScheduleBottomUp() {  
  // Release any predecessors of the special Exit node.  
  ReleasePredecessors(&ExitSU);                          # 释放前驱(前驱已经ready)
  
  // Add root to Available queue.  
  if (!SUnits.empty()) {  
    SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];  
    assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");  
    RootSU->isAvailable = true;  
    AvailableQueue->push(RootSU);                        # 加入root节点:调度的起点
  }  
  // While Available queue is not empty, grab the node with the highest  
  // priority. If it is not ready put it back.  Schedule the node.  Sequence.reserve(SUnits.size());  
  while (!AvailableQueue->empty() || !Interferences.empty()) {   # 主流程都在这个循环体内
    LLVM_DEBUG(dbgs() << "\nExamining Available:\n";  
               AvailableQueue->dump(this));  
  
    // Pick the best node to schedule taking all constraints into  
    // consideration.    
    SUnit *SU = PickNodeToScheduleBottomUp();                      # 选择最佳的指令(后面来看选择的策略)
  
    AdvancePastStalls(SU);                                          # 处理流水线停顿
  
    ScheduleNodeBottomUp(SU);                                      # 执行调度、处理给中依赖
  
    while (AvailableQueue->empty() && !PendingQueue.empty()) {  # 这一段的含义是假如此时没有任何指令可以调度(全部要等待流水线),那么增加周期到第一个可以调度的指令!
      // Advance the cycle to free resources. Skip ahead to the next ready SU.  
      assert(MinAvailableCycle < std::numeric_limits<unsigned>::max() &&  
             "MinAvailableCycle uninitialized");  
      AdvanceToCycle(std::max(CurCycle + 1, MinAvailableCycle));  
    }  }  
  // Reverse the order if it is bottom up.  
  std::reverse(Sequence.begin(), Sequence.end());   # 自底向上需要返序
  
#ifndef NDEBUG  
  VerifyScheduledSequence(/*isBottomUp=*/true);  
#endif  
}

那么这一块引入了四个核心函数,我们一起看一下:

  • PickNodeToScheduleBottomUp:逻辑较为复杂,简要为:
    • 尝试从 AvailableQueue 选择最高优先级的指令 (CurSU),并检查其是否因寄存器冲突而需要延迟 (DelayForLiveRegsBottomUp)。
    • 若有寄存器冲突,则将 CurSU 放入 Interferences 队列,并继续尝试下一个候选指令。若所有候选指令都因寄存器冲突而无法调度
      • 尝试回溯 (BacktrackBottomUp) 以重新安排先前的指令,使当前指令可执行。
      • 如果回溯失败,则尝试复制 (Duplicate) 相关指令,或插入额外的寄存器复制指令 (InsertCopiesAndMoveSuccs) 以打破冲突。
    • 最终选定一个可以调度的指令并返回 (CurSU),确保指令调度过程不会卡住。
  • AdvancePastStalls:这个函数的作用是确保当前调度单元 (SU) 可以被安全调度,即所有依赖的指令都已执行完毕,并且资源无冲突。首先,它推进当前调度周期 (CurCycle) 至 SU 最早可执行的时间点 (ReadyCycle),确保其依赖的指令已完成。然后,它检查 SU 是否存在资源冲突(例如流水线冲突),如果有,则不断增加等待周期 (Stalls),直到冲突消除。最终,它确保 SU 在正确的周期调度,不会因依赖未就绪或资源冲突而出错。
  • ScheduleNodeBottomUp: 采用贪心策略进行指令调度,它在每个周期优先调度当前可用、优先级最高的指令,并尽可能释放资源,使后续指令尽早可用。它主要通过寄存器可用性、数据冒险检测、流水线资源等因素来决定何时推进调度周期,以保证指令的最大并行度。==优先级最高,如何来,我们后面再继续讲==
  • AdvanceToCycle:这一块理解就可,不多叙述

    其中最简单的 linearize 调度方法

    /// Linearizes the DAG without performing any real scheduling.
    ///
    /// Pass 1 stores each node's use count in its node id, collects nodes
    /// whose last result is a used MVT::Glue value together with their glued
    /// user, and counts the nodes that will be emitted. Pass 2 folds the
    /// remaining uses of each glue producer into its glued user so the pair is
    /// emitted together. Finally the sequence is built starting from the root.
    void ScheduleDAGLinearize::Schedule() {
      LLVM_DEBUG(dbgs() << "********** DAG Linearization **********\n");

      SmallVector<SDNode*, 8> Glues;
      unsigned DAGSize = 0;
      for (SDNode &Node : DAG->allnodes()) {
        SDNode *N = &Node;

        // Use node id to record degree.
        unsigned Degree = N->use_size();
        N->setNodeId(Degree);
        unsigned NumVals = N->getNumValues();
        if (NumVals && N->getValueType(NumVals-1) == MVT::Glue &&
            N->hasAnyUseOfValue(NumVals-1)) {
          SDNode *User = findGluedUser(N);
          if (User) {
            Glues.push_back(N);
            GluedMap.insert(std::make_pair(N, User));
          }
        }

        if (N->isMachineOpcode() ||
            (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N)))
          ++DAGSize;
      }

      for (SDNode *Glue : Glues) {
        SDNode *GUser = GluedMap[Glue];
        unsigned Degree = Glue->getNodeId();
        unsigned UDegree = GUser->getNodeId();

        // Glue user must be scheduled together with the glue operand. So other
        // users of the glue operand must be treated as its users.
        SDNode *ImmGUser = Glue->getGluedUser();
        for (const SDNode *U : Glue->uses())
          if (U == ImmGUser)
            --Degree;
        GUser->setNodeId(UDegree + Degree);
        Glue->setNodeId(1);
      }

      Sequence.reserve(DAGSize);
      ScheduleNode(DAG->getRoot().getNode());
    }
    其简单逻辑是:
  • 遍历 DAG(Directed Acyclic Graph)中的所有节点,基于 使用关系 (use_size()) 计算调度顺序。
  • 处理 Glue 依赖(特殊的 MVT::Glue 连接),保证 Glue 相关的指令必须被一起调度。
  • 线性化 DAG,并存入 Sequence,以便后续指令选择和寄存器分配。

    源码解读 PostRASched

    源码位于 PostRASchedulerList.cpp,默认关闭,核心位于 runOnMachineFunction:逻辑和 preRA 类似
    /// Runs the post-register-allocation list scheduler over every basic
    /// block of \p Fn.
    ///
    /// Each block is walked backwards and split into regions at calls and at
    /// instructions the target reports as scheduling boundaries; every region
    /// is scheduled and emitted separately, and register kill flags are fixed
    /// up once the block is done.
    ///
    /// \param Fn the machine function to schedule.
    /// \returns false if the function is skipped or post-RA scheduling is not
    ///          enabled for the subtarget; true otherwise.
    bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
      if (skipFunction(Fn.getFunction()))
        return false;

      TII = Fn.getSubtarget().getInstrInfo();
      MachineLoopInfo &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
      AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();

      RegClassInfo.runOnMachineFunction(Fn);

      TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
        TargetSubtargetInfo::ANTIDEP_NONE;
      SmallVector<const TargetRegisterClass*, 4> CriticalPathRCs;

      // Check that post-RA scheduling is enabled for this target.
      // This may upgrade the AntiDepMode.
      if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
                                 AntiDepMode, CriticalPathRCs))
        return false;

      // Check for antidep breaking override...
      if (EnableAntiDepBreaking.getPosition() > 0) {
        AntiDepMode = (EnableAntiDepBreaking == "all")
          ? TargetSubtargetInfo::ANTIDEP_ALL
          : ((EnableAntiDepBreaking == "critical")
             ? TargetSubtargetInfo::ANTIDEP_CRITICAL
             : TargetSubtargetInfo::ANTIDEP_NONE);
      }

      LLVM_DEBUG(dbgs() << "PostRAScheduler\n");

      SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
                                     CriticalPathRCs);

      // Loop over all of the basic blocks
      for (auto &MBB : Fn) {
    #ifndef NDEBUG
        // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
        if (DebugDiv > 0) {
          static int bbcnt = 0;
          if (bbcnt++ % DebugDiv != DebugMod)
            continue;
          dbgs() << "*** DEBUG scheduling " << Fn.getName() << ":"
                 << printMBBReference(MBB) << " ***\n";
        }
    #endif

        // Initialize register live-range state for scheduling in this block.
        Scheduler.startBlock(&MBB);

        // Schedule each sequence of instructions not interrupted by a label
        // or anything else that effectively needs to shut down scheduling.
        MachineBasicBlock::iterator Current = MBB.end();
        unsigned Count = MBB.size(), CurrentCount = Count;
        for (MachineBasicBlock::iterator I = Current; I != MBB.begin();) {
          MachineInstr &MI = *std::prev(I);
          --Count;
          // Calls are not scheduling boundaries before register allocation, but
          // post-ra we don't gain anything by scheduling across calls since we
          // don't need to worry about register pressure.
          if (MI.isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) {
            // Schedule the region [I, Current) that ends just before the
            // previous boundary, then restart from this boundary instruction.
            Scheduler.enterRegion(&MBB, I, Current, CurrentCount - Count);
            Scheduler.setEndIndex(CurrentCount);
            Scheduler.schedule();
            Scheduler.exitRegion();
            Scheduler.EmitSchedule();
            Current = &MI;
            CurrentCount = Count;
            Scheduler.Observe(MI, CurrentCount);
          }
          I = MI;
          if (MI.isBundle())
            Count -= MI.getBundleSize();
        }
        assert(Count == 0 && "Instruction count mismatch!");
        assert((MBB.begin() == Current || CurrentCount != 0) &&
               "Instruction count mismatch!");
        // Schedule the remaining region at the top of the block.
        Scheduler.enterRegion(&MBB, MBB.begin(), Current, CurrentCount);
        Scheduler.setEndIndex(CurrentCount);
        Scheduler.schedule();
        Scheduler.exitRegion();
        Scheduler.EmitSchedule();

        // Clean up register live-range state.
        Scheduler.finishBlock();

        // Update register kills
        Scheduler.fixupKills(MBB);
      }

      return true;
    }

源码解读 machine-scheduler(RA 前和 RA 后都在一个文件)

入口位于 bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {

image-20250313205544549.webp
最深层的选择最优策略在 tryCandidate 里:
将按照以下优先级执行:

  1. 物理寄存器偏好
    • 目标是让物理寄存器的定义和使用尽量相邻,减少物理寄存器的生命周期,降低寄存器竞争。
  2. 寄存器压力控制
    • 避免调度会导致寄存器溢出的指令,防止因寄存器不足导致性能下降或溢出到内存(增加额外的 spill/fill 负担)。
  3. 关键寄存器压力控制
    • 额外关注那些会导致关键寄存器集合(如浮点寄存器组、通用寄存器组)压力过高的指令,避免让关键寄存器成为瓶颈。
  4. 资源占用控制
    • 避免调度会过度占用执行资源(如 ALU、FPU 或 Load/Store 单元)的指令,确保资源能均衡使用,不会在后续指令调度时出现资源不足的情况。
  5. 指令延迟优化
    • 优先调度高延迟指令(如乘法、内存访问),使其尽早开始执行,从而减少流水线的等待时间,提高整体吞吐量。
  6. 指令簇优化
    • 如果两条指令属于同一个优化簇(Cluster,例如连续的 Load/Store 操作),那么会优先调度属于相同簇的指令,以减少乱序执行导致的额外调度开销。
  7. 弱依赖优化
    • 优先调度依赖较少的指令,让它们更快地完成并释放依赖关系,使更多的指令可以进入可调度状态,提高整体指令吞吐量。
  8. 避免增加全局寄存器压力
    • 除了局部寄存器压力外,还会检查整个程序区域的寄存器使用情况,避免调度导致寄存器长期占用过高,影响后续调度。
  9. 回退到指令原始顺序
    • 如果所有的启发式规则都无法区分候选指令的优先级,则按照代码原始顺序调度,以保持代码结构的稳定性。

    Bug 发现

    我需要研究一下为什么在开启 O1/O2/O3 时有时不能使用 -mllvm -pre-RA-sched=linearize 的问题。可能是另一个 Issue

    源码解析

    想要在 pre-RA-sched 阶段拿到他是对什么内容进行调度的,可以观察如下源码:
    ScheduleDAGSDNodes 类中有一个 std::vector<SUnit*> Sequence;,而 Sequence 正是需要调度的序列。
    -view-sched-dags
    但是哪一个对性能提升最大呢?(有待数据说话)
    GPT 看法:
    ScheduleDAGSDNodes(影响较小)
  1. 需要较好地 控制 IR 到 MachineInstr 生成的初始顺序,降低寄存器压力。
  2. 作用范围仅限 SDNode,不考虑 CPU 执行单元的调度。
  3. 之后 MIScheduler 仍可能会大幅调整顺序。
  4. 无法优化 Load/Store 指令的内存访问模式,也不会对 Pipeline Stall 进行优化。
    MIScheduler(影响更大,重点优化目标)
  1. 指令并行性(ILP)优化:让 CPU 尽可能同时执行多个指令。
  2. 避免流水线 Stalls:调整 Load/Store 顺序,优化 Cache 访问。
  3. 分支预测优化:减少控制依赖带来的损失。
  4. 寄存器使用优化:减少寄存器重命名冲突。
  5. 现代超标量 CPU,可大幅减少 Pipeline Stall,提升指令吞吐量。
    ScheduleDAGSDNodes 主要是优化 MachineInstr 生成的初始顺序,但后续 MIScheduler 仍可能会覆盖其优化效果。除非你的目标是 特定 ISA(如 VLIW),否则 最终执行效率主要取决于 MIScheduler

日志分析

我自己简单插了几个桩,打印了一些调度前后的日志。

SelectionDAGISel 阶段的日志

=======Before List Scheduling
SU(0): t8: ch = RET_ReallyLR Register:i32 $w0, t7, t7:1

    t7: ch,glue = CopyToReg t4, Register:i32 $w0, t10

  # preds left       : 2
  # succs left       : 0
  # rdefs left       : 0
  Latency            : 1
  Depth              : 2
  Height             : 0
  Predecessors:
    SU(2): Ord  Latency=1 Barrier
    SU(1): Data Latency=1
SU(1): t10: i32,ch = CopyFromReg t0, Register:i32 $wzr

  # preds left       : 0
  # succs left       : 1
  # rdefs left       : 1
  Latency            : 1
  Depth              : 0
  Height             : 1
  Successors:
    SU(0): Data Latency=1
SU(2): t4: ch = lifetime.end<0 to 4> t2, TargetFrameIndex:i64<0>

  # preds left       : 1
  # succs left       : 1
  # rdefs left       : 0
  Latency            : 1
  Depth              : 1
  Height             : 1
  Predecessors:
    SU(3): Ord  Latency=1 Barrier
  Successors:
    SU(0): Ord  Latency=1 Barrier
SU(3): t2: ch = lifetime.end<0 to 4> t0, TargetFrameIndex:i64<1>

  # preds left       : 0
  # succs left       : 1
  # rdefs left       : 0
  Latency            : 1
  Depth              : 0
  Height             : 2
  Successors:
    SU(2): Ord  Latency=1 Barrier
=======Dump Pre List Scheduling
=======Dump List Scheduling
SU(3): t2: ch = lifetime.end<0 to 4> t0, TargetFrameIndex:i64<1>

SU(2): t4: ch = lifetime.end<0 to 4> t2, TargetFrameIndex:i64<0>

SU(1): t10: i32,ch = CopyFromReg t0, Register:i32 $wzr

SU(0): t8: ch = RET_ReallyLR Register:i32 $w0, t7, t7:1

    t7: ch,glue = CopyToReg t4, Register:i32 $w0, t10

这一段日志清晰可读,和后面的 MIR 日志一对比你们就知道了。毕竟 SDNode(SUnit)在 DAG 里还是相对简化的内容。
我做了一个有意思的实验,把 ScheduleDAGRRList 里的std::reverse(Sequence.begin(), Sequence.end()); 注释掉了,结果断言错误了:
Assertion failed: (I != VRBaseMap.end() && "Node emitted out of order - late"), function getVR, file InstrEmitter.cpp, line 284. 这是在调度后的一个步骤。这个可能是为后续寄存器分配的分析时的错误。

  • VRBaseMap 中没有 Op 的映射,即:
    • InstrEmitter::EmitMachineNode() 可能还没给 Op 分配 VReg。
    • Op 可能本应该更早出现,但由于 Sequence 调度顺序问题,导致 Op 还没被 EmitMachineNode() 处理。

ScheduleDAGMI

下面这就比较逆天了, 因为是 MIR,里面可以说啥信息都有,都附加在了指令上面,所以非常冗长

========= Before MSched
********** INTERVALS **********
WSP [16r,16d:23)[64r,64d:22)[80r,80d:21)[96r,96d:20)[224r,224d:19)[240r,240d:18)[688r,688d:17)[832r,832d:16)[848r,848d:15)[944r,944d:14)[1056r,1056d:13)[1072r,1072d:12)[1168r,1168d:11)[1280r,1280d:10)[1296r,1296d:9)[1408r,1408d:8)[1536r,1536d:7)[1552r,1552d:6)[1648r,1648d:5)[1760r,1760d:4)[1776r,1776d:3)[1824r,1824d:2)[1872r,1872d:1)[1888r,1888d:0) 0@1888r 1@1872r 2@1824r 3@1776r 4@1760r 5@1648r 6@1552r 7@1536r 8@1408r 9@1296r 10@1280r 11@1168r 12@1072r 13@1056r 14@944r 15@848r 16@832r 17@688r 18@240r 19@224r 20@96r 21@80r 22@64r 23@16r
%12 [32r,48r:0) 0@32r  weight:0.000000e+00
%15 [128r,144r:0) 0@128r  weight:0.000000e+00
%16 [160r,176r:0) 0@160r  weight:0.000000e+00
%17 [192r,208r:0) 0@192r  weight:0.000000e+00
%25 [480r,512r:0) 0@480r  weight:0.000000e+00
%26 [496r,512r:0) 0@496r  weight:0.000000e+00
%27 [512r,528r:0) 0@512r  weight:0.000000e+00
%28 [544r,560r:0) 0@544r  weight:0.000000e+00
%31 [288r,720r:0) 0@288r  weight:0.000000e+00
%34 [272r,752r:0) 0@272r  weight:0.000000e+00
%36 [256r,784r:0) 0@256r  weight:0.000000e+00
%38 [800r,816r:0) 0@800r  weight:0.000000e+00
%40 [304r,976r:0) 0@304r  weight:0.000000e+00
%43 [880r,1008r:0) 0@880r  weight:0.000000e+00
%45 [912r,992r:0) 0@912r  weight:0.000000e+00
%47 [1024r,1040r:0) 0@1024r  weight:0.000000e+00
%49 [320r,1200r:0) 0@320r  weight:0.000000e+00
%52 [1104r,1232r:0) 0@1104r  weight:0.000000e+00
%54 [1136r,1216r:0) 0@1136r  weight:0.000000e+00
%56 [1248r,1264r:0) 0@1248r  weight:0.000000e+00
%60 [1376r,1488r:0) 0@1376r  weight:0.000000e+00
%62 [1312r,1472r:0) 0@1312r  weight:0.000000e+00
%64 [1504r,1520r:0) 0@1504r  weight:0.000000e+00
%66 [432r,464B:1)[560r,624B:2)[624B,1680r:0) 0@624B-phi 1@432r 2@560r  weight:0.000000e+00
%69 [1584r,1712r:0) 0@1584r  weight:0.000000e+00
%71 [1616r,1696r:0) 0@1616r  weight:0.000000e+00
%73 [1728r,1744r:0) 0@1728r  weight:0.000000e+00
%75 [1840r,1856r:0) 0@1840r  weight:0.000000e+00
%78 [400r,464B:0)[528r,624B:1)[624B,1440r:2) 0@400r 1@528r 2@624B-phi  weight:0.000000e+00
RegMasks: 64r 224r 832r 1056r 1280r 1536r 1760r 1872r
********** MACHINEINSTRS **********
# Machine code for function main: NoPHIs, TracksLiveness, TiedOpsRewritten
Frame Objects:
  fi#0: size=4, align=4, at location [SP]
  fi#1: size=4, align=4, at location [SP]

0B      bb.0.entry:
          successors: %bb.1(0x30000000), %bb.2(0x50000000); %bb.1(37.50%), %bb.2(62.50%)

16B       ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
32B       %12:gpr64common = MOVaddr target-flags(aarch64-page) @.str, target-flags(aarch64-pageoff, aarch64-nc) @.str
48B       $x0 = COPY %12:gpr64common
64B       BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
80B       ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
96B       ADJCALLSTACKDOWN 16, 0, implicit-def dead $sp, implicit $sp
128B      %15:gpr64common = ADDXri %stack.1.b, 0, 0
144B      STRXui %15:gpr64common, $sp, 1 :: (store (s64) into stack + 8)
160B      %16:gpr64common = ADDXri %stack.0.a, 0, 0
176B      STRXui %16:gpr64common, $sp, 0 :: (store (s64) into stack)
192B      %17:gpr64common = MOVaddr target-flags(aarch64-page) @.str.1, target-flags(aarch64-pageoff, aarch64-nc) @.str.1
208B      $x0 = COPY %17:gpr64common
224B      BL @scanf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
240B      ADJCALLSTACKUP 16, 0, implicit-def dead $sp, implicit $sp
256B      undef %36.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
272B      undef %34.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
288B      undef %31.sub_32:gpr64 = nsw ADDWrr %34.sub_32:gpr64, %36.sub_32:gpr64
304B      undef %40.sub_32:gpr64 = nsw SUBWrr %36.sub_32:gpr64, %34.sub_32:gpr64
320B      undef %49.sub_32:gpr64 = nsw MADDWrrr %34.sub_32:gpr64, %36.sub_32:gpr64, $wzr
336B      CBNZW %34.sub_32:gpr64, %bb.2

352B    bb.1:
        ; predecessors: %bb.0
          successors: %bb.3(0x80000000); %bb.3(100.00%)

400B      %78:fpr64 = FMOVD0
432B      undef %66.sub_32:gpr64 = COPY $wzr
448B      B %bb.3

464B    bb.2.cond.true5:
        ; predecessors: %bb.0
          successors: %bb.3(0x80000000); %bb.3(100.00%)

480B      %25:fpr32 = nofpexcept SCVTFUWSri %36.sub_32:gpr64, implicit $fpcr
496B      %26:fpr32 = nofpexcept SCVTFUWSri %34.sub_32:gpr64, implicit $fpcr
512B      %27:fpr32 = nofpexcept FDIVSrr %25:fpr32, %26:fpr32, implicit $fpcr
528B      %78:fpr64 = nofpexcept FCVTDSr %27:fpr32, implicit $fpcr
544B      %28:gpr32 = SDIVWr %36.sub_32:gpr64, %34.sub_32:gpr64
560B      undef %66.sub_32:gpr64 = MSUBWrrr %28:gpr32, %34.sub_32:gpr64, %36.sub_32:gpr64

624B    bb.3.cond.end7:
        ; predecessors: %bb.2, %bb.1
          successors: %bb.5(0x30000000), %bb.4(0x50000000); %bb.5(37.50%), %bb.4(62.50%)

688B      ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
720B      STRXui %31:gpr64, $sp, 2 :: (store (s64) into stack + 16)
752B      STRXui %34:gpr64, $sp, 1 :: (store (s64) into stack + 8)
784B      STRXui %36:gpr64, $sp, 0 :: (store (s64) into stack)
800B      %38:gpr64common = MOVaddr target-flags(aarch64-page) @.str.2, target-flags(aarch64-pageoff, aarch64-nc) @.str.2
816B      $x0 = COPY %38:gpr64common
832B      BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
848B      ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
880B      undef %43.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
912B      undef %45.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
944B      ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
976B      STRXui %40:gpr64, $sp, 2 :: (store (s64) into stack + 16)
992B      STRXui %45:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1008B     STRXui %43:gpr64, $sp, 0 :: (store (s64) into stack)
1024B     %47:gpr64common = MOVaddr target-flags(aarch64-page) @.str.3, target-flags(aarch64-pageoff, aarch64-nc) @.str.3
1040B     $x0 = COPY %47:gpr64common
1056B     BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1072B     ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
1104B     undef %52.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
1136B     undef %54.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
1168B     ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
1200B     STRXui %49:gpr64, $sp, 2 :: (store (s64) into stack + 16)
1216B     STRXui %54:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1232B     STRXui %52:gpr64, $sp, 0 :: (store (s64) into stack)
1248B     %56:gpr64common = MOVaddr target-flags(aarch64-page) @.str.4, target-flags(aarch64-pageoff, aarch64-nc) @.str.4
1264B     $x0 = COPY %56:gpr64common
1280B     BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1296B     ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
1312B     undef %62.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
1328B     CBZW %62.sub_32:gpr64, %bb.5
1344B     B %bb.4

1360B   bb.4.if.then:
        ; predecessors: %bb.3
          successors: %bb.6(0x80000000); %bb.6(100.00%)

1376B     undef %60.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
1408B     ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
1440B     STRDui %78:fpr64, $sp, 2 :: (store (s64) into stack + 16)
1472B     STRXui %62:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1488B     STRXui %60:gpr64, $sp, 0 :: (store (s64) into stack)
1504B     %64:gpr64common = MOVaddr target-flags(aarch64-page) @.str.5, target-flags(aarch64-pageoff, aarch64-nc) @.str.5
1520B     $x0 = COPY %64:gpr64common
1536B     BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1552B     ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
1584B     undef %69.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
1616B     undef %71.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
1648B     ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
1680B     STRXui %66:gpr64, $sp, 2 :: (store (s64) into stack + 16)
1696B     STRXui %71:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1712B     STRXui %69:gpr64, $sp, 0 :: (store (s64) into stack)
1728B     %73:gpr64common = MOVaddr target-flags(aarch64-page) @.str.6, target-flags(aarch64-pageoff, aarch64-nc) @.str.6
1744B     $x0 = COPY %73:gpr64common
1760B     BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1776B     ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
1792B     B %bb.6

1808B   bb.5.if.else:
        ; predecessors: %bb.3
          successors: %bb.6(0x80000000); %bb.6(100.00%)

1824B     ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
1840B     %75:gpr64common = MOVaddr target-flags(aarch64-page) @str, target-flags(aarch64-pageoff, aarch64-nc) @str
1856B     $x0 = COPY %75:gpr64common
1872B     BL @puts, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1888B     ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp

1904B   bb.6.if.end:
        ; predecessors: %bb.4, %bb.5

1936B     $w0 = COPY $wzr
1952B     RET_ReallyLR implicit killed $w0

# End machine code for function main.

========= After MSched
********** INTERVALS **********
WSP [16r,16d:23)[64r,64d:22)[80r,80d:21)[96r,96d:20)[224r,224d:19)[240r,240d:18)[688r,688d:17)[832r,832d:16)[848r,848d:15)[944r,944d:14)[1056r,1056d:13)[1072r,1072d:12)[1168r,1168d:11)[1280r,1280d:10)[1296r,1296d:9)[1408r,1408d:8)[1536r,1536d:7)[1552r,1552d:6)[1648r,1648d:5)[1760r,1760d:4)[1776r,1776d:3)[1824r,1824d:2)[1872r,1872d:1)[1888r,1888d:0) 0@1888r 1@1872r 2@1824r 3@1776r 4@1760r 5@1648r 6@1552r 7@1536r 8@1408r 9@1296r 10@1280r 11@1168r 12@1072r 13@1056r 14@944r 15@848r 16@832r 17@688r 18@240r 19@224r 20@96r 21@80r 22@64r 23@16r
%12 [32r,48r:0) 0@32r  weight:0.000000e+00
%15 [128r,168r:0) 0@128r  weight:0.000000e+00
%16 [160r,176r:0) 0@160r  weight:0.000000e+00
%17 [192r,208r:0) 0@192r  weight:0.000000e+00
%25 [480r,512r:0) 0@480r  weight:0.000000e+00
%26 [496r,512r:0) 0@496r  weight:0.000000e+00
%27 [512r,528r:0) 0@512r  weight:0.000000e+00
%28 [544r,560r:0) 0@544r  weight:0.000000e+00
%31 [288r,720r:0) 0@288r  weight:0.000000e+00
%34 [272r,752r:0) 0@272r  weight:0.000000e+00
%36 [256r,784r:0) 0@256r  weight:0.000000e+00
%38 [800r,816r:0) 0@800r  weight:0.000000e+00
%40 [304r,976r:0) 0@304r  weight:0.000000e+00
%43 [880r,1008r:0) 0@880r  weight:0.000000e+00
%45 [912r,992r:0) 0@912r  weight:0.000000e+00
%47 [1024r,1040r:0) 0@1024r  weight:0.000000e+00
%49 [320r,1200r:0) 0@320r  weight:0.000000e+00
%52 [1104r,1232r:0) 0@1104r  weight:0.000000e+00
%54 [1136r,1216r:0) 0@1136r  weight:0.000000e+00
%56 [1248r,1264r:0) 0@1248r  weight:0.000000e+00
%60 [1376r,1488r:0) 0@1376r  weight:0.000000e+00
%62 [1312r,1472r:0) 0@1312r  weight:0.000000e+00
%64 [1504r,1520r:0) 0@1504r  weight:0.000000e+00
%66 [360r,464B:1)[560r,624B:2)[624B,1680r:0) 0@624B-phi 1@360r 2@560r  weight:0.000000e+00
%69 [1584r,1712r:0) 0@1584r  weight:0.000000e+00
%71 [1616r,1696r:0) 0@1616r  weight:0.000000e+00
%73 [1728r,1744r:0) 0@1728r  weight:0.000000e+00
%75 [1840r,1856r:0) 0@1840r  weight:0.000000e+00
%78 [400r,464B:0)[528r,624B:1)[624B,1440r:2) 0@400r 1@528r 2@624B-phi  weight:0.000000e+00
RegMasks: 64r 224r 832r 1056r 1280r 1536r 1760r 1872r
********** MACHINEINSTRS **********
# Machine code for function main: NoPHIs, TracksLiveness, TiedOpsRewritten
Frame Objects:
  fi#0: size=4, align=4, at location [SP]
  fi#1: size=4, align=4, at location [SP]

0B      bb.0.entry:
          successors: %bb.1(0x30000000), %bb.2(0x50000000); %bb.1(37.50%), %bb.2(62.50%)

16B       ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
32B       %12:gpr64common = MOVaddr target-flags(aarch64-page) @.str, target-flags(aarch64-pageoff, aarch64-nc) @.str
48B       $x0 = COPY %12:gpr64common
64B       BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
80B       ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
96B       ADJCALLSTACKDOWN 16, 0, implicit-def dead $sp, implicit $sp
128B      %15:gpr64common = ADDXri %stack.1.b, 0, 0
160B      %16:gpr64common = ADDXri %stack.0.a, 0, 0
168B      STRXui %15:gpr64common, $sp, 1 :: (store (s64) into stack + 8)
176B      STRXui %16:gpr64common, $sp, 0 :: (store (s64) into stack)
192B      %17:gpr64common = MOVaddr target-flags(aarch64-page) @.str.1, target-flags(aarch64-pageoff, aarch64-nc) @.str.1
208B      $x0 = COPY %17:gpr64common
224B      BL @scanf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
240B      ADJCALLSTACKUP 16, 0, implicit-def dead $sp, implicit $sp
256B      undef %36.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
272B      undef %34.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
288B      undef %31.sub_32:gpr64 = nsw ADDWrr %34.sub_32:gpr64, %36.sub_32:gpr64
304B      undef %40.sub_32:gpr64 = nsw SUBWrr %36.sub_32:gpr64, %34.sub_32:gpr64
320B      undef %49.sub_32:gpr64 = nsw MADDWrrr %34.sub_32:gpr64, %36.sub_32:gpr64, $wzr
336B      CBNZW %34.sub_32:gpr64, %bb.2

352B    bb.1:
        ; predecessors: %bb.0
          successors: %bb.3(0x80000000); %bb.3(100.00%)

360B      undef %66.sub_32:gpr64 = COPY $wzr
400B      %78:fpr64 = FMOVD0
448B      B %bb.3

464B    bb.2.cond.true5:
        ; predecessors: %bb.0
          successors: %bb.3(0x80000000); %bb.3(100.00%)

480B      %25:fpr32 = nofpexcept SCVTFUWSri %36.sub_32:gpr64, implicit $fpcr
496B      %26:fpr32 = nofpexcept SCVTFUWSri %34.sub_32:gpr64, implicit $fpcr
512B      %27:fpr32 = nofpexcept FDIVSrr %25:fpr32, %26:fpr32, implicit $fpcr
528B      %78:fpr64 = nofpexcept FCVTDSr %27:fpr32, implicit $fpcr
544B      %28:gpr32 = SDIVWr %36.sub_32:gpr64, %34.sub_32:gpr64
560B      undef %66.sub_32:gpr64 = MSUBWrrr %28:gpr32, %34.sub_32:gpr64, %36.sub_32:gpr64

624B    bb.3.cond.end7:
        ; predecessors: %bb.2, %bb.1
          successors: %bb.5(0x30000000), %bb.4(0x50000000); %bb.5(37.50%), %bb.4(62.50%)

688B      ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
720B      STRXui %31:gpr64, $sp, 2 :: (store (s64) into stack + 16)
752B      STRXui %34:gpr64, $sp, 1 :: (store (s64) into stack + 8)
784B      STRXui %36:gpr64, $sp, 0 :: (store (s64) into stack)
800B      %38:gpr64common = MOVaddr target-flags(aarch64-page) @.str.2, target-flags(aarch64-pageoff, aarch64-nc) @.str.2
816B      $x0 = COPY %38:gpr64common
832B      BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
848B      ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
880B      undef %43.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
912B      undef %45.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
944B      ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
976B      STRXui %40:gpr64, $sp, 2 :: (store (s64) into stack + 16)
992B      STRXui %45:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1008B     STRXui %43:gpr64, $sp, 0 :: (store (s64) into stack)
1024B     %47:gpr64common = MOVaddr target-flags(aarch64-page) @.str.3, target-flags(aarch64-pageoff, aarch64-nc) @.str.3
1040B     $x0 = COPY %47:gpr64common
1056B     BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1072B     ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
1104B     undef %52.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
1136B     undef %54.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
1168B     ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
1200B     STRXui %49:gpr64, $sp, 2 :: (store (s64) into stack + 16)
1216B     STRXui %54:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1232B     STRXui %52:gpr64, $sp, 0 :: (store (s64) into stack)
1248B     %56:gpr64common = MOVaddr target-flags(aarch64-page) @.str.4, target-flags(aarch64-pageoff, aarch64-nc) @.str.4
1264B     $x0 = COPY %56:gpr64common
1280B     BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
1296B     ADJCALLSTACKUP 24, 0, implicit-def dead $sp, implicit $sp
1312B     undef %62.sub_32:gpr64 = LDRWui %stack.1.b, 0 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
1328B     CBZW %62.sub_32:gpr64, %bb.5
1344B     B %bb.4

1360B   bb.4.if.then:
        ; predecessors: %bb.3
          successors: %bb.6(0x80000000); %bb.6(100.00%)

1376B     undef %60.sub_32:gpr64 = LDRWui %stack.0.a, 0 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
1408B     ADJCALLSTACKDOWN 24, 0, implicit-def dead $sp, implicit $sp
1440B     STRDui %78:fpr64, $sp, 2 :: (store (s64) into stack + 16)
1472B     STRXui %62:gpr64, $sp, 1 :: (store (s64) into stack + 8)
1488B     STRXui %60:gpr64, $sp, 0 :: (store (s64) into stack)
1504B     %64:gpr64common = MOVaddr target-flags

ScheduleDAGMI (Post RA)

=========Before post-MI-sched:
# Machine code for function main: NoPHIs, TracksLiveness, NoVRegs, TiedOpsRewritten, TracksDebugUserValues
Frame Objects:
  fi#0: size=4, align=4, at location [SP-68]
  fi#1: size=4, align=4, at location [SP-72]
  fi#2: size=8, align=8, at location [SP-8]
  fi#3: size=8, align=8, at location [SP-16]
  fi#4: size=8, align=8, at location [SP-24]
  fi#5: size=8, align=8, at location [SP-32]
  fi#6: size=8, align=8, at location [SP-40]
  fi#7: size=8, align=8, at location [SP-48]
  fi#8: size=8, align=8, at location [SP-56]
  fi#9: size=8, align=8, at location [SP-64]

bb.0.entry:
  successors: %bb.1(0x30000000), %bb.2(0x50000000); %bb.1(37.50%), %bb.2(62.50%)
  liveins: $d8, $d9, $x21, $x22, $x19, $x20, $lr
  $sp = frame-setup SUBXri $sp, 96, 0
  frame-setup STPDi killed $d9, killed $d8, $sp, 4 :: (store (s64) into %stack.9), (store (s64) into %stack.8)
  frame-setup STPXi killed $x22, killed $x21, $sp, 6 :: (store (s64) into %stack.7), (store (s64) into %stack.6)
  frame-setup STPXi killed $x20, killed $x19, $sp, 8 :: (store (s64) into %stack.5), (store (s64) into %stack.4)
  frame-setup STPXi killed $fp, killed $lr, $sp, 10 :: (store (s64) into %stack.3), (store (s64) into %stack.2)
  $fp = frame-setup ADDXri $sp, 80, 0
  frame-setup CFI_INSTRUCTION def_cfa $w29, 16
  frame-setup CFI_INSTRUCTION offset $w30, -8
  frame-setup CFI_INSTRUCTION offset $w29, -16
  frame-setup CFI_INSTRUCTION offset $w19, -24
  frame-setup CFI_INSTRUCTION offset $w20, -32
  frame-setup CFI_INSTRUCTION offset $w21, -40
  frame-setup CFI_INSTRUCTION offset $w22, -48
  frame-setup CFI_INSTRUCTION offset $b8, -56
  frame-setup CFI_INSTRUCTION offset $b9, -64
  $x0 = ADRP target-flags(aarch64-page) @.str
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  $x8 = ADDXri $sp, 24, 0
  $x9 = ADDXri $sp, 28, 0
  STPXi killed renamable $x9, killed renamable $x8, $sp, 0 :: (store (s64) into stack + 8), (store (s64) into stack)
  $x0 = ADRP target-flags(aarch64-page) @.str.1
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.1, 0
  BL @scanf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  $w10 = ADDWrs renamable $w9, renamable $w8, 0, implicit-def $x10
  $w21 = SUBWrs renamable $w8, renamable $w9, 0, implicit-def $x21
  renamable $w20 = nsw MADDWrrr renamable $w9, renamable $w8, $wzr, implicit-def $x20
  CBNZW renamable $w9, %bb.2

bb.1:
; predecessors: %bb.0
  successors: %bb.3(0x80000000); %bb.3(100.00%)
  liveins: $x8, $x9, $x10, $x20, $x21
  $w19 = MOVZWi 0, 0, implicit-def $x19
  renamable $d8 = FMOVD0
  B %bb.3

bb.2.cond.true5:
; predecessors: %bb.0
  successors: %bb.3(0x80000000); %bb.3(100.00%)
  liveins: $x8, $x9, $x10, $x20, $x21
  renamable $s0 = nofpexcept SCVTFUWSri renamable $w8, implicit $fpcr
  renamable $s1 = nofpexcept SCVTFUWSri renamable $w9, implicit $fpcr
  renamable $s0 = nofpexcept FDIVSrr killed renamable $s0, killed renamable $s1, implicit $fpcr
  renamable $d8 = nofpexcept FCVTDSr killed renamable $s0, implicit $fpcr
  renamable $w11 = SDIVWr renamable $w8, renamable $w9
  renamable $w19 = MSUBWrrr killed renamable $w11, renamable $w9, renamable $w8, implicit-def $x19

bb.3.cond.end7:
; predecessors: %bb.2, %bb.1
  successors: %bb.5(0x30000000), %bb.4(0x50000000); %bb.5(37.50%), %bb.4(62.50%)
  liveins: $d8, $x8, $x9, $x10, $x19, $x20, $x21
  STPXi killed renamable $x9, killed renamable $x10, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  $x0 = ADRP target-flags(aarch64-page) @.str.2
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.2, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  STPXi killed renamable $x9, killed renamable $x21, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  $x0 = ADRP target-flags(aarch64-page) @.str.3
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.3, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  STPXi killed renamable $x9, killed renamable $x20, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  $x0 = ADRP target-flags(aarch64-page) @.str.4
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.4, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w8 = LDRWui $sp, 6, implicit-def $x8 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
  CBZW renamable $w8, %bb.5

bb.4.if.then:
; predecessors: %bb.3
  successors: %bb.6(0x80000000); %bb.6(100.00%)
  liveins: $d8, $x8, $x19
  renamable $w9 = LDRWui $sp, 7, implicit-def $x9 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
  STRDui killed renamable $d8, $sp, 2 :: (store (s64) into stack + 16)
  STPXi killed renamable $x9, killed renamable $x8, $sp, 0 :: (store (s64) into stack + 8), (store (s64) into stack)
  $x0 = ADRP target-flags(aarch64-page) @.str.5
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.5, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  STPXi killed renamable $x9, killed renamable $x19, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  $x0 = ADRP target-flags(aarch64-page) @.str.6
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.6, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0
  B %bb.6

bb.5.if.else:
; predecessors: %bb.3
  successors: %bb.6(0x80000000); %bb.6(100.00%)

  $x0 = ADRP target-flags(aarch64-page) @str
  renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @str, 0
  BL @puts, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def dead $w0

bb.6.if.end:
; predecessors: %bb.4, %bb.5

  $w0 = MOVZWi 0, 0
  $fp, $lr = frame-destroy LDPXi $sp, 10 :: (load (s64) from %stack.3), (load (s64) from %stack.2)
  $x20, $x19 = frame-destroy LDPXi $sp, 8 :: (load (s64) from %stack.5), (load (s64) from %stack.4)
  $x22, $x21 = frame-destroy LDPXi $sp, 6 :: (load (s64) from %stack.7), (load (s64) from %stack.6)
  $d9, $d8 = frame-destroy LDPDi $sp, 4 :: (load (s64) from %stack.9), (load (s64) from %stack.8)
  $sp = frame-destroy ADDXri $sp, 96, 0
  RET undef $lr, implicit $w0

# End machine code for function main.

========After post-MI-sched:
# Machine code for function main: NoPHIs, TracksLiveness, NoVRegs, TiedOpsRewritten, TracksDebugUserValues
Frame Objects:
  fi#0: size=4, align=4, at location [SP-68]
  fi#1: size=4, align=4, at location [SP-72]
  fi#2: size=8, align=8, at location [SP-8]
  fi#3: size=8, align=8, at location [SP-16]
  fi#4: size=8, align=8, at location [SP-24]
  fi#5: size=8, align=8, at location [SP-32]
  fi#6: size=8, align=8, at location [SP-40]
  fi#7: size=8, align=8, at location [SP-48]
  fi#8: size=8, align=8, at location [SP-56]
  fi#9: size=8, align=8, at location [SP-64]

bb.0.entry:
  successors: %bb.1(0x30000000), %bb.2(0x50000000); %bb.1(37.50%), %bb.2(62.50%)
  liveins: $d8, $d9, $x21, $x22, $x19, $x20, $lr
  $sp = frame-setup SUBXri $sp, 96, 0
  frame-setup STPDi killed $d9, killed $d8, $sp, 4 :: (store (s64) into %stack.9), (store (s64) into %stack.8)
  frame-setup STPXi killed $x22, killed $x21, $sp, 6 :: (store (s64) into %stack.7), (store (s64) into %stack.6)
  frame-setup STPXi killed $x20, killed $x19, $sp, 8 :: (store (s64) into %stack.5), (store (s64) into %stack.4)
  frame-setup STPXi $fp, killed $lr, $sp, 10 :: (store (s64) into %stack.3), (store (s64) into %stack.2)
  $fp = frame-setup ADDXri $sp, 80, 0
  frame-setup CFI_INSTRUCTION def_cfa $w29, 16
  frame-setup CFI_INSTRUCTION offset $w30, -8
  frame-setup CFI_INSTRUCTION offset $w29, -16
  frame-setup CFI_INSTRUCTION offset $w19, -24
  frame-setup CFI_INSTRUCTION offset $w20, -32
  frame-setup CFI_INSTRUCTION offset $w21, -40
  frame-setup CFI_INSTRUCTION offset $w22, -48
  frame-setup CFI_INSTRUCTION offset $b8, -56
  frame-setup CFI_INSTRUCTION offset $b9, -64
  $x0 = ADRP target-flags(aarch64-page) @.str
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  $x0 = ADRP target-flags(aarch64-page) @.str.1
  $x8 = ADDXri $sp, 24, 0
  $x9 = ADDXri $sp, 28, 0
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.1, 0
  STPXi killed renamable $x9, killed renamable $x8, $sp, 0 :: (store (s64) into stack + 8), (store (s64) into stack)
  BL @scanf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  renamable $w20 = nsw MADDWrrr renamable $w9, renamable $w8, $wzr, implicit-def $x20
  $w10 = ADDWrs renamable $w9, renamable $w8, 0, implicit-def $x10
  $w21 = SUBWrs renamable $w8, renamable $w9, 0, implicit-def $x21
  CBNZW renamable $w9, %bb.2

bb.1:
; predecessors: %bb.0
  successors: %bb.3(0x80000000); %bb.3(100.00%)
  liveins: $x8, $x9, $x10, $x20, $x21
  $w19 = MOVZWi 0, 0, implicit-def $x19
  renamable $d8 = FMOVD0
  B %bb.3

bb.2.cond.true5:
; predecessors: %bb.0
  successors: %bb.3(0x80000000); %bb.3(100.00%)
  liveins: $x8, $x9, $x10, $x20, $x21
  renamable $s0 = nofpexcept SCVTFUWSri renamable $w8, implicit $fpcr
  renamable $s1 = nofpexcept SCVTFUWSri renamable $w9, implicit $fpcr
  renamable $w11 = SDIVWr renamable $w8, renamable $w9
  renamable $s0 = nofpexcept FDIVSrr killed renamable $s0, killed renamable $s1, implicit $fpcr
  renamable $w19 = MSUBWrrr killed renamable $w11, renamable $w9, renamable $w8, implicit-def $x19
  renamable $d8 = nofpexcept FCVTDSr killed renamable $s0, implicit $fpcr

bb.3.cond.end7:
; predecessors: %bb.2, %bb.1
  successors: %bb.5(0x30000000), %bb.4(0x50000000); %bb.5(37.50%), %bb.4(62.50%)
  liveins: $d8, $x8, $x9, $x10, $x19, $x20, $x21
  $x0 = ADRP target-flags(aarch64-page) @.str.2
  STPXi killed renamable $x9, killed renamable $x10, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.2, 0
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  $x0 = ADRP target-flags(aarch64-page) @.str.3
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.3, 0
  STPXi killed renamable $x9, killed renamable $x21, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  $x0 = ADRP target-flags(aarch64-page) @.str.4
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.4, 0
  STPXi killed renamable $x9, killed renamable $x20, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w8 = LDRWui $sp, 6, implicit-def $x8 :: (dereferenceable load (s32) from %ir.b, !tbaa !6)
  CBZW renamable $w8, %bb.5

bb.4.if.then:
; predecessors: %bb.3
  successors: %bb.6(0x80000000); %bb.6(100.00%)
  liveins: $d8, $x8, $x19
  renamable $w9 = LDRWui $sp, 7, implicit-def $x9 :: (dereferenceable load (s32) from %ir.a, !tbaa !6)
  $x0 = ADRP target-flags(aarch64-page) @.str.5
  STRDui killed renamable $d8, $sp, 2 :: (store (s64) into stack + 16)
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.5, 0
  STPXi killed renamable $x9, killed renamable $x8, $sp, 0 :: (store (s64) into stack + 8), (store (s64) into stack)
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  renamable $w9, renamable $w8 = LDPWi $sp, 6 :: (dereferenceable load (s32) from %ir.a, !tbaa !6), (dereferenceable load (s32) from %ir.b, !tbaa !6)
  $x0 = ADRP target-flags(aarch64-page) @.str.6
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @.str.6, 0
  STPXi killed renamable $x9, killed renamable $x19, $sp, 1 :: (store (s64) into stack + 16), (store (s64) into stack + 8)
  STRXui killed renamable $x8, $sp, 0 :: (store (s64) into stack)
  BL @printf, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0
  B %bb.6

bb.5.if.else:
; predecessors: %bb.3
  successors: %bb.6(0x80000000); %bb.6(100.00%)

  $x0 = ADRP target-flags(aarch64-page) @str
  renamable $x0 = ADDXri killed $x0, target-flags(aarch64-pageoff, aarch64-nc) @str, 0
  BL @puts, <regmask $fp $lr $wzr $xzr $b8 $b9 $b10 $b11 $b12 $b13 $b14 $b15 $d8 $d9 $d10 $d11 $d12 $d13 $d14 $d15 $h8 $h9 $h10 $h11 $h12 $h13 $h14 $h15 $s8 $s9 $s10 $s11 $s12 and 55 more...>, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, implicit-def dead $w0

bb.6.if.end:
; predecessors: %bb.4, %bb.5

  $fp, $lr = frame-destroy LDPXi $sp, 10 :: (load (s64) from %stack.3), (load (s64) from %stack.2)
  $x20, $x19 = frame-destroy LDPXi $sp, 8 :: (load (s64) from %stack.5), (load (s64) from %stack.4)
  $w0 = MOVZWi 0, 0
  $x22, $x21 = frame-destroy LDPXi $sp, 6 :: (load (s64) from %stack.7), (load (s64) from %stack.6)
  $d9, $d8 = frame-destroy LDPDi $sp, 4 :: (load (s64) from %stack.9), (load (s64) from %stack.8)
  $sp = frame-destroy ADDXri $sp, 96, 0
  RET undef $lr, implicit killed $w0

# End machine code for function main.

PostRASchedulerList (SchedulePostRATDList)

暂时还没法打印出这一阶段(PostRASchedulerList)的调试日志。

参考文献

静态数组在同一维度内相对基地址是固定跨步的;但如果在不同维度之间需要重新计算一次 base,就会失去静态数组的优势。
而通过 malloc 动态分配的数组则可能需要重新计算 base 或 offset。
`ld xxx, offset(sp)` 是典型的寄存器溢出(spill)相关语句:从栈上把之前溢出的值重新加载回寄存器。
lw、ld 的 latency 一致。
小 kernel 中不能加入函数调用(call)。