Clean up: remove redundant comments and docstrings from scan support code

JacobSzwejbka · JacobSzwejbka · commit bfd7f369f681 · 2025-12-05T11:05:11.000-08:00
diff --git a/exir/emit/_emitter.py b/exir/emit/_emitter.py
@@ -993,19 +993,14 @@ def _emit_scan(
         num_carry = len(init)
         num_xs = len(xs)
 
-        # Split output values into carry outputs and y outputs
         carry_outputs = list(subemitter_binding_output_values[:num_carry])
         y_outputs = list(subemitter_binding_output_values[num_carry:])
 
         if num_xs < 1:
             raise RuntimeError("Scan requires at least one xs tensor to scan over.")
 
-        # === INITIALIZATION ===
-
-        # Generate iterator index EValue
         iter_idx = self._emit_evalue(EValue(Int(0)))
 
-        # Get scan length from first xs tensor
         op_index, op = self._get_operator(
             name="aten::sym_size",
             overload="int",
@@ -1019,9 +1014,7 @@ def _emit_scan(
         )
         self.chain.instructions.append(kernel)
 
-        # Initialize carry_outputs from init by copying init -> carry_outputs
-        # This is necessary because we shouldn't mutate the original init tensors
-        # Use aten::copy_.default which copies src to self in-place
+        # Initialize carry_outputs from init
         op_index_copy, _ = self._get_operator(name="aten::copy_")
         for init_val, carry_out in zip(init, carry_outputs):
             kernel = Instruction(
@@ -1037,11 +1030,7 @@ def _emit_scan(
             )
             self.chain.instructions.append(kernel)
 
-        # === LOOP START ===
-
         # Slice each xs tensor for the current iteration
-        # We use -1 as placeholder for the output tensor id, which will be filled
-        # after the scan_emitter runs and allocates the input placeholder EValues
         op_index_select, _ = self._get_operator(
             name="aten::select_copy",
             overload="int_out",
@@ -1053,69 +1042,46 @@ def _emit_scan(
                     op_index=op_index_select,
                     args=[
                         x.id,
-                        self._emit_evalue(EValue(Int(0))).id,  # dim=0
+                        self._emit_evalue(EValue(Int(0))).id,
                         iter_idx.id,
-                        -1,  # placeholder for output tensor id
-                        -1,  # placeholder (repeated for out variant)
+                        -1,
+                        -1,
                     ],
                 )
             )
             xs_slice_instructions.append(kernel)
 
-        # Store jump target - this is where we jump back to after each iteration
         jump_to_instruction = self.instruction_start_offset + len(
             self.chain.instructions
         )
 
-        # Add all xs slice instructions
         for kernel in xs_slice_instructions:
             self.chain.instructions.append(kernel)
 
-        # === EMIT COMBINE_FN SUBMODULE ===
-
-        # combine_fn inputs: (*carry, *xs_slice, *additional_inputs)
-        # We bind carry inputs to carry_outputs (the working carry buffers)
-        # xs_slice inputs will be filled in after emitter runs (using -1 placeholder)
-        # additional_inputs are passed through directly
+        # Emit combine_fn submodule
         binding_input_values: List[Any] = []
-        binding_input_values.extend(
-            carry_outputs
-        )  # Carry inputs bound to carry_outputs
-        binding_input_values.extend([-1] * num_xs)  # Placeholders for xs slices
-        binding_input_values.extend(additional_inputs)  # Additional inputs
-
-        # combine_fn outputs: (*next_carry, *y_slice)
-        # Pass binding_output_values=None so the combine_fn writes directly to its
-        # own output buffers (concrete_output_ids). We then copy from these directly
-        # to the final carry/y buffers, avoiding unnecessary temp buffers and MOVEs.
+        binding_input_values.extend(carry_outputs)
+        binding_input_values.extend([-1] * num_xs)
+        binding_input_values.extend(additional_inputs)
+
         scan_emitter = _Emitter(
             combine_fn,
             self.emitter_state,
             self.program_state,
             instruction_start_offset=self.instruction_start_offset
             + len(self.chain.instructions),
             binding_input_values=binding_input_values,
-            binding_output_values=None,  # Use concrete outputs directly
+            binding_output_values=None,
         )
         scan_emitter.run()
 
-        # Merge combine_fn instructions
         self._merge_chain(scan_emitter.chain)
-        # NOTE: When binding_output_values=None, no return/move instruction is added
-        # by the output() method, so we don't need to pop anything.
 
-        # Update xs_slice instructions with the actual placeholder EValue ids
-        # The xs placeholders start after the carry inputs in combine_fn
         for i, kernel in enumerate(xs_slice_instructions):
             xs_placeholder_id = scan_emitter.binding_input_values[num_carry + i].id
             kernel.instr_args.args[-1] = xs_placeholder_id
             kernel.instr_args.args[-2] = xs_placeholder_id
 
-        # === COPY OUTPUTS ===
-
-        # Get combine_fn's actual output EValues
-        # concrete_output_ids contains the actual EValues that the combine_fn
-        # graph operations write to: (*carry_temp, *y_temp)
         concrete_outputs = scan_emitter.concrete_output_ids
         carry_temp = concrete_outputs[:num_carry]
         y_temp = concrete_outputs[num_carry:]
@@ -1132,8 +1098,6 @@ def _emit_scan(
         )
 
         # Copy carry_temp -> carry_outputs for next iteration
-        # This explicit copy is required because in-place op.out(x, out=x) is unsafe
-        # aten::copy_ signature: (self, src, non_blocking, out) -> self
         for carry_t, carry_out in zip(carry_temp, carry_outputs):
             kernel = Instruction(
                 KernelCall(
@@ -1148,7 +1112,7 @@ def _emit_scan(
             )
             self.chain.instructions.append(kernel)
 
-        # Copy y_temp to stacked y_outputs using et_copy_index
+        # Copy y_temp to stacked y_outputs
         op_index_copy_index, _ = self._get_operator(
             name="executorch_prim::et_copy_index",
             overload="tensor",
@@ -1162,8 +1126,6 @@ def _emit_scan(
             )
             self.chain.instructions.append(kernel)
 
-        # === LOOP CONTROL ===
-
         # Increment iter_idx
         op_index_add, _ = self._get_operator(
             name="executorch_prim::add",
@@ -1191,7 +1153,6 @@ def _emit_scan(
         )
         self.chain.instructions.append(kernel)
 
-        # Jump back to loop start if not done
         jf_beginning_loop = Instruction(
             JumpFalseCall(
                 cond_value_index=jump_bool_value.id,
@@ -1200,9 +1161,7 @@ def _emit_scan(
         )
         self.chain.instructions.append(jf_beginning_loop)
 
-        # === CLEANUP ===
-
-        # Reset iter_idx for potential re-runs of the model
+        # Reset iter_idx for potential re-runs
         op_index_sub, _ = self._get_operator(
             name="executorch_prim::sub",
             overload="Scalar",
diff --git a/exir/memory_planning.py b/exir/memory_planning.py
@@ -1024,11 +1024,6 @@ def get_map_nodes(graph_module: torch.fx.GraphModule) -> Iterable[Node]:
 
 
 def get_scan_nodes(graph_module: torch.fx.GraphModule) -> Iterable[Node]:
-    """Get all scan nodes in the graph module.
-
-    Scan nodes have the signature: scan(combine_fn, init, xs, additional_inputs)
-    where combine_fn is a submodule at args[0].
-    """
     for nd in graph_module.graph.nodes:
         if nd.target is torch.ops.higher_order.scan:
             yield nd
@@ -1172,12 +1167,6 @@ def _handle(
     for map_node in get_map_nodes(graph_module):
         _handle(cast(torch.fx.Node, map_node.args[0]), alloc_graph_input=True)
 
-    # Handle scan nodes
-    # Scan signature: scan(combine_fn, init, xs, additional_inputs)
-    # combine_fn is at args[0]
-    # Like map, scan needs alloc_graph_input=True because the runtime slices
-    # xs tensors during each iteration, requiring allocated input buffers.
-    # Additionally, scan has carry state that flows between iterations.
     for scan_node in get_scan_nodes(graph_module):
         _handle(cast(torch.fx.Node, scan_node.args[0]), alloc_graph_input=True)
 
diff --git a/exir/pass_base.py b/exir/pass_base.py
@@ -558,29 +558,10 @@ def call_scan(
         additional_inputs: List[ProxyValue],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        """
-        Process a scan higher-order operation.
-
-        Scan applies combine_fn iteratively, carrying state across iterations:
-            combine_fn(carry, x_slice) -> (next_carry, y_slice)
-
-        Args:
-            combine_fn: GraphModule implementing the scan body
-            init: Initial carry state values
-            xs: Input tensors to scan over (along dim 0)
-            additional_inputs: Additional arguments passed to combine_fn
-            meta: Node metadata
-
-        Returns:
-            ProxyValue containing (final_carry, stacked_outputs)
-        """
-        # Get the first slice of xs to determine input shapes for combine_fn
-        # combine_fn inputs: (*init, *xs_slice, *additional_inputs)
         xs_first_slice = _unstack_pytree([arg.data for arg in xs])[0]
         init_data = [arg.data for arg in init]
         additional_data = [arg.data for arg in additional_inputs]
 
-        # Call submodule with representative inputs
         combine_fn_result = self.call_submodule(
             combine_fn, tuple(init_data + xs_first_slice + additional_data)
         )
diff --git a/exir/passes/spec_prop_pass.py b/exir/passes/spec_prop_pass.py
@@ -150,61 +150,23 @@ def call_scan(
         additional_inputs: List[ProxyValue],
         meta: NodeMetadata,
     ) -> ProxyValue:
-        """
-        Propagate specs for scan higher-order operation.
-
-        Scan returns (final_carry, stacked_outputs) where:
-        - final_carry: Same shape as init (NOT stacked, just the final carry state)
-        - stacked_outputs: Outputs stacked along dim 0 with scan_length
-
-        The combine_fn signature is:
-            combine_fn(*init, *xs_slice, *additional_inputs) -> (*next_carry, *y_slice)
-
-        So the combine_fn outputs are split into:
-        - First len(init) outputs: carry values (same shape as init)
-        - Remaining outputs: y values (to be stacked)
-
-        Memory Layout Note:
-        The specs created here are for the FINAL outputs of the scan operation:
-        - carry specs: Working carry buffers that persist across iterations.
-          These are SEPARATE from combine_fn's output buffers. The emitter
-          must copy from combine_fn's temporary carry output to these buffers
-          after each iteration (in-place op.out(x, out=x) is unsafe).
-        - y specs: Pre-allocated stacked buffers filled via et_copy_index.
-
-        The combine_fn's internal temporary buffers are allocated separately
-        via memory planning with alloc_graph_input=True, alloc_graph_output=True.
-        """
-        # Get scan length from first xs tensor
         scan_length = [arg.data for arg in xs][0].size(0)
 
-        # Get the output node from combine_fn
         *_, body_out_node = combine_fn.graph.nodes
         body_out_fake = body_out_node.meta["val"]
 
-        # The combine_fn outputs are: (*next_carry, *y_slice)
-        # Split them based on the number of init values
         num_carry = len(init)
-
-        # Flatten the outputs to handle them uniformly
         flat_body_out, out_spec = pytree.tree_flatten(body_out_fake)
 
-        # Split into carry outputs and y outputs
         carry_out = flat_body_out[:num_carry]
         y_out = flat_body_out[num_carry:]
 
-        # Create specs:
-        # - Carry: same shape as combine_fn output (NOT stacked)
-        #   These are working buffers that get updated each iteration
-        # - Y: stacked along dim 0 with scan_length
-        carry_fake = carry_out  # Carry keeps same shape
-
+        carry_fake = carry_out
         y_fake = [
             x.new_empty(scan_length, *x.shape) if isinstance(x, torch.Tensor) else x
             for x in y_out
         ]
 
-        # Combine carry and stacked y outputs
         combined_fake = carry_fake + y_fake
 
         meta["spec"] = pytree.tree_map(make_spec, combined_fake)
diff --git a/exir/tests/control_flow_models.py b/exir/tests/control_flow_models.py
@@ -158,7 +158,6 @@ def forward(
         self, xs: torch.Tensor, scale: torch.Tensor
     ) -> tuple[torch.Tensor, torch.Tensor]:
         def combine_fn(carry, x):
-            # Scale is captured from outer scope
             new_carry = carry + x * scale
             return new_carry, new_carry.clone()