diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py
index b2f27195a8..93ca7b298c 100644
--- a/deepmd/dpmodel/descriptor/dpa3.py
+++ b/deepmd/dpmodel/descriptor/dpa3.py
@@ -271,6 +271,9 @@ class DescrptDPA3(NativeOP, BaseDescriptor):
         Whether to use electronic configuration type embedding.
     use_tebd_bias : bool, Optional
         Whether to use bias in the type embedding layer.
+    use_loc_mapping : bool, Optional
+        Whether to use local atom index mapping in training or non-parallel inference.
+        When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation.
     type_map : list[str], Optional
         A list of strings. Give the name to each type of atoms.
     """
@@ -290,6 +293,7 @@ def __init__(
         seed: Optional[Union[int, list[int]]] = None,
         use_econf_tebd: bool = False,
         use_tebd_bias: bool = False,
+        use_loc_mapping: bool = True,
         type_map: Optional[list[str]] = None,
     ) -> None:
         super().__init__()
@@ -335,6 +339,7 @@ def init_subclass_params(sub_data, sub_class):
             use_exp_switch=self.repflow_args.use_exp_switch,
             use_dynamic_sel=self.repflow_args.use_dynamic_sel,
             sel_reduce_factor=self.repflow_args.sel_reduce_factor,
+            use_loc_mapping=use_loc_mapping,
             exclude_types=exclude_types,
             env_protection=env_protection,
             precision=precision,
@@ -343,6 +348,7 @@ def init_subclass_params(sub_data, sub_class):
 
         self.use_econf_tebd = use_econf_tebd
         self.use_tebd_bias = use_tebd_bias
+        self.use_loc_mapping = use_loc_mapping
         self.type_map = type_map
         self.tebd_dim = self.repflow_args.n_dim
         self.type_embedding = TypeEmbedNet(
@@ -541,10 +547,16 @@ def call(
         nall = xp.reshape(coord_ext, (nframes, -1)).shape[1] // 3
 
         type_embedding = self.type_embedding.call()
-        node_ebd_ext = xp.reshape(
-            xp.take(type_embedding, xp.reshape(atype_ext, [-1]), axis=0),
-            (nframes, nall, self.tebd_dim),
-        )
+        if self.use_loc_mapping:
+            node_ebd_ext = xp.reshape(
+                xp.take(type_embedding, xp.reshape(atype_ext[:, :nloc], [-1]), axis=0),
+                (nframes, nloc, self.tebd_dim),
+            )
+        else:
+            node_ebd_ext = xp.reshape(
+                xp.take(type_embedding, xp.reshape(atype_ext, [-1]), axis=0),
+                (nframes, nall, self.tebd_dim),
+            )
         node_ebd_inp = node_ebd_ext[:, :nloc, :]
         # repflows
         node_ebd, edge_ebd, h2, rot_mat, sw = self.repflows(
@@ -563,7 +575,7 @@ def serialize(self) -> dict:
         data = {
             "@class": "Descriptor",
             "type": "dpa3",
-            "@version": 1,
+            "@version": 2,
             "ntypes": self.ntypes,
             "repflow_args": self.repflow_args.serialize(),
             "concat_output_tebd": self.concat_output_tebd,
@@ -574,6 +586,7 @@ def serialize(self) -> dict:
             "trainable": self.trainable,
             "use_econf_tebd": self.use_econf_tebd,
             "use_tebd_bias": self.use_tebd_bias,
+            "use_loc_mapping": self.use_loc_mapping,
             "type_map": self.type_map,
             "type_embedding": self.type_embedding.serialize(),
         }
@@ -598,7 +611,7 @@ def serialize(self) -> dict:
     def deserialize(cls, data: dict) -> "DescrptDPA3":
         data = data.copy()
         version = data.pop("@version")
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(version, 2, 1)
         data.pop("@class")
         data.pop("type")
         repflow_variable = data.pop("repflow_variable").copy()
diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py
index df0b81d9d2..a7158da1aa 100644
--- a/deepmd/dpmodel/descriptor/repflows.py
+++ b/deepmd/dpmodel/descriptor/repflows.py
@@ -145,6 +145,9 @@ class DescrptBlockRepflows(NativeOP, DescriptorBlock):
         In the dynamic selection case, neighbor-scale normalization will use `e_sel / sel_reduce_factor`
         or `a_sel / sel_reduce_factor` instead of the raw `e_sel` or `a_sel` values,
         accommodating larger selection numbers.
+    use_loc_mapping : bool, optional
+        Whether to use local atom index mapping in training or non-parallel inference.
+        When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation.
     ntypes : int
         Number of element types
     activation_function : str, optional
@@ -196,6 +199,7 @@ def __init__(
         use_exp_switch: bool = False,
         use_dynamic_sel: bool = False,
         sel_reduce_factor: float = 10.0,
+        use_loc_mapping: bool = True,
         seed: Optional[Union[int, list[int]]] = None,
     ) -> None:
         super().__init__()
@@ -229,6 +233,7 @@ def __init__(
         self.smooth_edge_update = smooth_edge_update
         self.use_exp_switch = use_exp_switch
         self.use_dynamic_sel = use_dynamic_sel
+        self.use_loc_mapping = use_loc_mapping
         self.sel_reduce_factor = sel_reduce_factor
         if self.use_dynamic_sel and not self.smooth_edge_update:
             raise NotImplementedError(
@@ -527,10 +532,22 @@ def call(
             cosine_ij, (nframes, nloc, self.a_sel, self.a_sel, 1)
         ) / (xp.pi**0.5)
 
+        if self.use_loc_mapping:
+            assert mapping is not None
+            flat_map = xp.reshape(mapping, (nframes, -1))
+            nlist = xp.reshape(
+                xp_take_along_axis(flat_map, xp.reshape(nlist, (nframes, -1)), axis=1),
+                nlist.shape,
+            )
+
         if self.use_dynamic_sel:
             # get graph index
             edge_index, angle_index = get_graph_index(
-                nlist, nlist_mask, a_nlist_mask, nall
+                nlist,
+                nlist_mask,
+                a_nlist_mask,
+                nall,
+                use_loc_mapping=self.use_loc_mapping,
             )
             # flat all the tensors
             # n_edge x 1
@@ -561,7 +578,11 @@ def call(
         for idx, ll in enumerate(self.layers):
             # node_ebd:     nb x nloc x n_dim
             # node_ebd_ext: nb x nall x n_dim
-            node_ebd_ext = xp_take_along_axis(node_ebd, mapping, axis=1)
+            node_ebd_ext = (
+                node_ebd
+                if self.use_loc_mapping
+                else xp_take_along_axis(node_ebd, mapping, axis=1)
+            )
             node_ebd, edge_ebd, angle_ebd = ll.call(
                 node_ebd_ext,
                 edge_ebd,
@@ -667,6 +688,7 @@ def serialize(self):
             "smooth_edge_update": self.smooth_edge_update,
             "use_dynamic_sel": self.use_dynamic_sel,
             "sel_reduce_factor": self.sel_reduce_factor,
+            "use_loc_mapping": self.use_loc_mapping,
             # variables
             "edge_embd": self.edge_embd.serialize(),
             "angle_embd": self.angle_embd.serialize(),
diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py
index 6a3c6d8081..a1ac9c3797 100644
--- a/deepmd/dpmodel/utils/network.py
+++ b/deepmd/dpmodel/utils/network.py
@@ -1006,6 +1006,7 @@ def get_graph_index(
     nlist_mask: np.ndarray,
     a_nlist_mask: np.ndarray,
     nall: int,
+    use_loc_mapping: bool = True,
 ):
     """
     Get the index mapping for edge graph and angle graph, ready in `aggregate` or `index_select`.
@@ -1020,6 +1021,9 @@ def get_graph_index(
         Masks of the neighbor list for angle. real nei 1 otherwise 0
     nall
         The number of extended atoms.
+    use_loc_mapping
+        Whether to use local atom index mapping in training or non-parallel inference.
+        When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation.
 
     Returns
     -------
@@ -1060,7 +1064,9 @@ def get_graph_index(
     n2e_index = n2e_index[xp.astype(nlist_mask, xp.bool)]
 
     # node_ext(j) to edge(ij) index_select
-    frame_shift = xp.arange(nf, dtype=nlist.dtype) * nall
+    frame_shift = xp.arange(nf, dtype=nlist.dtype) * (
+        nall if not use_loc_mapping else nloc
+    )
     shifted_nlist = nlist + frame_shift[:, xp.newaxis, xp.newaxis]
     # n_edge
     n_ext2e_index = shifted_nlist[xp.astype(nlist_mask, xp.bool)]
diff --git a/deepmd/pd/model/descriptor/dpa3.py b/deepmd/pd/model/descriptor/dpa3.py
index 47b30c5349..82f0351205 100644
--- a/deepmd/pd/model/descriptor/dpa3.py
+++ b/deepmd/pd/model/descriptor/dpa3.py
@@ -89,6 +89,9 @@ class DescrptDPA3(BaseDescriptor, paddle.nn.Layer):
         Whether to use electronic configuration type embedding.
     use_tebd_bias : bool, Optional
         Whether to use bias in the type embedding layer.
+    use_loc_mapping : bool, Optional
+        Whether to use local atom index mapping in training or non-parallel inference.
+        Not supported yet in Paddle.
     type_map : list[str], Optional
         A list of strings. Give the name to each type of atoms.
     """
@@ -108,6 +111,7 @@ def __init__(
         seed: Optional[Union[int, list[int]]] = None,
         use_econf_tebd: bool = False,
         use_tebd_bias: bool = False,
+        use_loc_mapping: bool = False,
         type_map: Optional[list[str]] = None,
     ) -> None:
         super().__init__()
@@ -152,6 +156,7 @@ def init_subclass_params(sub_data, sub_class):
             smooth_edge_update=self.repflow_args.smooth_edge_update,
             use_dynamic_sel=self.repflow_args.use_dynamic_sel,
             sel_reduce_factor=self.repflow_args.sel_reduce_factor,
+            use_loc_mapping=use_loc_mapping,
             exclude_types=exclude_types,
             env_protection=env_protection,
             precision=precision,
@@ -160,6 +165,7 @@ def init_subclass_params(sub_data, sub_class):
 
         self.use_econf_tebd = use_econf_tebd
         self.use_tebd_bias = use_tebd_bias
+        self.use_loc_mapping = use_loc_mapping
         self.type_map = type_map
         self.tebd_dim = self.repflow_args.n_dim
         self.type_embedding = TypeEmbedNet(
@@ -370,7 +376,7 @@ def serialize(self) -> dict:
         data = {
             "@class": "Descriptor",
             "type": "dpa3",
-            "@version": 1,
+            "@version": 2,
             "ntypes": self.ntypes,
             "repflow_args": self.repflow_args.serialize(),
             "concat_output_tebd": self.concat_output_tebd,
@@ -381,6 +387,7 @@ def serialize(self) -> dict:
             "trainable": self.trainable,
             "use_econf_tebd": self.use_econf_tebd,
             "use_tebd_bias": self.use_tebd_bias,
+            "use_loc_mapping": self.use_loc_mapping,
             "type_map": self.type_map,
             "type_embedding": self.type_embedding.embedding.serialize(),
         }
@@ -405,7 +412,7 @@ def serialize(self) -> dict:
     def deserialize(cls, data: dict) -> "DescrptDPA3":
         data = data.copy()
         version = data.pop("@version")
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(version, 2, 1)
         data.pop("@class")
         data.pop("type")
         repflow_variable = data.pop("repflow_variable").copy()
diff --git a/deepmd/pd/model/descriptor/repflows.py b/deepmd/pd/model/descriptor/repflows.py
index bf15db35c1..3200c26dba 100644
--- a/deepmd/pd/model/descriptor/repflows.py
+++ b/deepmd/pd/model/descriptor/repflows.py
@@ -112,6 +112,9 @@ class DescrptBlockRepflows(DescriptorBlock):
     optim_update : bool, optional
         Whether to enable the optimized update method.
         Uses a more efficient process when enabled. Defaults to True
+    use_loc_mapping : bool, Optional
+        Whether to use local atom index mapping in training or non-parallel inference.
+        Not supported yet in Paddle.
     ntypes : int
         Number of element types
     activation_function : str, optional
@@ -161,6 +164,7 @@ def __init__(
         smooth_edge_update: bool = False,
         use_dynamic_sel: bool = False,
         sel_reduce_factor: float = 10.0,
+        use_loc_mapping: bool = False,
         optim_update: bool = True,
         seed: Optional[Union[int, list[int]]] = None,
     ) -> None:
@@ -196,6 +200,10 @@ def __init__(
         self.use_dynamic_sel = use_dynamic_sel  # not supported yet
         self.sel_reduce_factor = sel_reduce_factor
         assert not self.use_dynamic_sel, "Dynamic selection is not supported yet."
+        self.use_loc_mapping = use_loc_mapping
+        # raise (not assert) so the check is still enforced under `python -O`
+        if self.use_loc_mapping:
+            raise NotImplementedError("Local mapping is not supported yet.")
 
         self.n_dim = n_dim
         self.e_dim = e_dim
diff --git a/deepmd/pt/model/descriptor/dpa3.py b/deepmd/pt/model/descriptor/dpa3.py
index de7b25749d..528c087374 100644
--- a/deepmd/pt/model/descriptor/dpa3.py
+++ b/deepmd/pt/model/descriptor/dpa3.py
@@ -89,6 +89,9 @@ class DescrptDPA3(BaseDescriptor, torch.nn.Module):
         Whether to use electronic configuration type embedding.
     use_tebd_bias : bool, Optional
         Whether to use bias in the type embedding layer.
+    use_loc_mapping : bool, Optional
+        Whether to use local atom index mapping in training or non-parallel inference.
+        When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation.
     type_map : list[str], Optional
         A list of strings. Give the name to each type of atoms.
     """
@@ -108,6 +111,7 @@ def __init__(
         seed: Optional[Union[int, list[int]]] = None,
         use_econf_tebd: bool = False,
         use_tebd_bias: bool = False,
+        use_loc_mapping: bool = True,
         type_map: Optional[list[str]] = None,
     ) -> None:
         super().__init__()
@@ -153,6 +157,7 @@ def init_subclass_params(sub_data, sub_class):
             use_exp_switch=self.repflow_args.use_exp_switch,
             use_dynamic_sel=self.repflow_args.use_dynamic_sel,
             sel_reduce_factor=self.repflow_args.sel_reduce_factor,
+            use_loc_mapping=use_loc_mapping,
             exclude_types=exclude_types,
             env_protection=env_protection,
             precision=precision,
@@ -160,6 +165,7 @@ def init_subclass_params(sub_data, sub_class):
         )
 
         self.use_econf_tebd = use_econf_tebd
+        self.use_loc_mapping = use_loc_mapping
         self.use_tebd_bias = use_tebd_bias
         self.type_map = type_map
         self.tebd_dim = self.repflow_args.n_dim
@@ -365,7 +371,7 @@ def serialize(self) -> dict:
         data = {
             "@class": "Descriptor",
             "type": "dpa3",
-            "@version": 1,
+            "@version": 2,
             "ntypes": self.ntypes,
             "repflow_args": self.repflow_args.serialize(),
             "concat_output_tebd": self.concat_output_tebd,
@@ -376,6 +382,7 @@ def serialize(self) -> dict:
             "trainable": self.trainable,
             "use_econf_tebd": self.use_econf_tebd,
             "use_tebd_bias": self.use_tebd_bias,
+            "use_loc_mapping": self.use_loc_mapping,
             "type_map": self.type_map,
             "type_embedding": self.type_embedding.embedding.serialize(),
         }
@@ -400,7 +407,7 @@ def serialize(self) -> dict:
     def deserialize(cls, data: dict) -> "DescrptDPA3":
         data = data.copy()
         version = data.pop("@version")
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(version, 2, 1)
         data.pop("@class")
         data.pop("type")
         repflow_variable = data.pop("repflow_variable").copy()
@@ -469,12 +476,16 @@ def forward(
             The smooth switch function. shape: nf x nloc x nnei
 
         """
+        parallel_mode = comm_dict is not None
         # cast the input to internal precsion
         extended_coord = extended_coord.to(dtype=self.prec)
         nframes, nloc, nnei = nlist.shape
         nall = extended_coord.view(nframes, -1).shape[1] // 3
 
-        node_ebd_ext = self.type_embedding(extended_atype)
+        if not parallel_mode and self.use_loc_mapping:
+            node_ebd_ext = self.type_embedding(extended_atype[:, :nloc])
+        else:
+            node_ebd_ext = self.type_embedding(extended_atype)
         node_ebd_inp = node_ebd_ext[:, :nloc, :]
         # repflows
         node_ebd, edge_ebd, h2, rot_mat, sw = self.repflows(
diff --git a/deepmd/pt/model/descriptor/repflow_layer.py b/deepmd/pt/model/descriptor/repflow_layer.py
index cc4b2a8865..37d4f07bb4 100644
--- a/deepmd/pt/model/descriptor/repflow_layer.py
+++ b/deepmd/pt/model/descriptor/repflow_layer.py
@@ -684,7 +684,7 @@ def optim_edge_update_dynamic(
 
     def forward(
         self,
-        node_ebd_ext: torch.Tensor,  # nf x nall x n_dim
+        node_ebd_ext: torch.Tensor,  # nf x nall x n_dim [OR] nf x nloc x n_dim when not parallel_mode
         edge_ebd: torch.Tensor,  # nf x nloc x nnei x e_dim
         h2: torch.Tensor,  # nf x nloc x nnei x 3
         angle_ebd: torch.Tensor,  # nf x nloc x a_nnei x a_nnei x a_dim
diff --git a/deepmd/pt/model/descriptor/repflows.py b/deepmd/pt/model/descriptor/repflows.py
index 1486ee358a..71e1d78cc5 100644
--- a/deepmd/pt/model/descriptor/repflows.py
+++ b/deepmd/pt/model/descriptor/repflows.py
@@ -156,6 +156,9 @@ class DescrptBlockRepflows(DescriptorBlock):
         In the dynamic selection case, neighbor-scale normalization will use `e_sel / sel_reduce_factor`
         or `a_sel / sel_reduce_factor` instead of the raw `e_sel` or `a_sel` values,
         accommodating larger selection numbers.
+    use_loc_mapping : bool, Optional
+        Whether to use local atom index mapping in training or non-parallel inference.
+        When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation.
     optim_update : bool, optional
         Whether to enable the optimized update method.
         Uses a more efficient process when enabled. Defaults to True
@@ -209,6 +212,7 @@ def __init__(
         use_exp_switch: bool = False,
         use_dynamic_sel: bool = False,
         sel_reduce_factor: float = 10.0,
+        use_loc_mapping: bool = True,
         optim_update: bool = True,
         seed: Optional[Union[int, list[int]]] = None,
     ) -> None:
@@ -239,6 +243,7 @@ def __init__(
         self.fix_stat_std = fix_stat_std
         self.set_stddev_constant = fix_stat_std != 0.0
         self.a_compress_use_split = a_compress_use_split
+        self.use_loc_mapping = use_loc_mapping
         self.optim_update = optim_update
         self.smooth_edge_update = smooth_edge_update
         self.use_exp_switch = use_exp_switch
@@ -416,9 +421,9 @@ def forward(
         mapping: Optional[torch.Tensor] = None,
         comm_dict: Optional[dict[str, torch.Tensor]] = None,
     ):
-        if comm_dict is None:
+        parallel_mode = comm_dict is not None
+        if not parallel_mode:
             assert mapping is not None
-            assert extended_atype_embd is not None
         nframes, nloc, nnei = nlist.shape
         nall = extended_coord.view(nframes, -1).shape[1] // 3
         atype = extended_atype[:, :nloc]
@@ -470,12 +475,9 @@ def forward(
 
         # get node embedding
         # [nframes, nloc, tebd_dim]
-        if comm_dict is None:
-            assert isinstance(extended_atype_embd, torch.Tensor)  # for jit
-            atype_embd = extended_atype_embd[:, :nloc, :]
-            assert list(atype_embd.shape) == [nframes, nloc, self.n_dim]
-        else:
-            atype_embd = extended_atype_embd
+        assert extended_atype_embd is not None
+        atype_embd = extended_atype_embd[:, :nloc, :]
+        assert list(atype_embd.shape) == [nframes, nloc, self.n_dim]
         assert isinstance(atype_embd, torch.Tensor)  # for jit
         node_ebd = self.act(atype_embd)
         n_dim = node_ebd.shape[-1]
@@ -494,10 +496,22 @@ def forward(
         cosine_ij = torch.matmul(normalized_diff_i, normalized_diff_j) * (1 - 1e-6)
         angle_input = cosine_ij.unsqueeze(-1) / (torch.pi**0.5)
 
+        if not parallel_mode and self.use_loc_mapping:
+            assert mapping is not None
+            # convert nlist from nall to nloc index (only done outside parallel mode)
+            nlist = torch.gather(
+                mapping,
+                1,
+                index=nlist.reshape(nframes, -1),
+            ).reshape(nlist.shape)
         if self.use_dynamic_sel:
             # get graph index
             edge_index, angle_index = get_graph_index(
-                nlist, nlist_mask, a_nlist_mask, nall
+                nlist,
+                nlist_mask,
+                a_nlist_mask,
+                nall,
+                use_loc_mapping=not parallel_mode and self.use_loc_mapping,
             )
             # flat all the tensors
             # n_edge x 1
@@ -524,18 +538,23 @@ def forward(
         angle_ebd = self.angle_embd(angle_input)
 
         # nb x nall x n_dim
-        if comm_dict is None:
+        if not parallel_mode:
             assert mapping is not None
             mapping = (
                 mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.n_dim)
             )
         for idx, ll in enumerate(self.layers):
             # node_ebd:     nb x nloc x n_dim
-            # node_ebd_ext: nb x nall x n_dim
-            if comm_dict is None:
+            # node_ebd_ext: nb x nall x n_dim [OR] nb x nloc x n_dim when not parallel_mode
+            if not parallel_mode:
                 assert mapping is not None
-                node_ebd_ext = torch.gather(node_ebd, 1, mapping)
+                node_ebd_ext = (
+                    torch.gather(node_ebd, 1, mapping)
+                    if not self.use_loc_mapping
+                    else node_ebd
+                )
             else:
+                assert comm_dict is not None
                 has_spin = "has_spin" in comm_dict
                 if not has_spin:
                     n_padding = nall - nloc
diff --git a/deepmd/pt/model/network/utils.py b/deepmd/pt/model/network/utils.py
index f9837952fe..dd1dc102a3 100644
--- a/deepmd/pt/model/network/utils.py
+++ b/deepmd/pt/model/network/utils.py
@@ -51,6 +51,7 @@ def get_graph_index(
     nlist_mask: torch.Tensor,
     a_nlist_mask: torch.Tensor,
     nall: int,
+    use_loc_mapping: bool = True,
 ):
     """
     Get the index mapping for edge graph and angle graph, ready in `aggregate` or `index_select`.
@@ -100,7 +101,9 @@ def get_graph_index(
     n2e_index = n2e_index[nlist_mask]  # graph node index, atom_graph[:, 0]
 
     # node_ext(j) to edge(ij) index_select
-    frame_shift = torch.arange(0, nf, dtype=nlist.dtype, device=nlist.device) * nall
+    frame_shift = torch.arange(0, nf, dtype=nlist.dtype, device=nlist.device) * (
+        nall if not use_loc_mapping else nloc
+    )
     shifted_nlist = nlist + frame_shift[:, None, None]
     # n_edge
     n_ext2e_index = shifted_nlist[nlist_mask]  # graph neighbor index, atom_graph[:, 1]
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 6e9663592f..38d99e00ed 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -1372,6 +1372,11 @@ def descrpt_dpa3_args():
     doc_seed = "Random seed for parameter initialization."
     doc_use_econf_tebd = "Whether to use electronic configuration type embedding."
     doc_use_tebd_bias = "Whether to use bias in the type embedding layer."
+    doc_use_loc_mapping = (
+        "Whether to use local atom index mapping in training or non-parallel inference. "
+        "When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation. "
+        "Not supported yet in the Paddle backend, where it must be set to False."
+    )
     return [
         # doc_repflow args
         Argument("repflow", dict, dpa3_repflow_args(), doc=doc_repflow),
@@ -1421,6 +1426,13 @@ def descrpt_dpa3_args():
             default=False,
             doc=doc_use_tebd_bias,
         ),
+        Argument(
+            "use_loc_mapping",
+            bool,
+            optional=True,
+            default=True,
+            doc=doc_use_loc_mapping,
+        ),
     ]
 
 
diff --git a/source/tests/consistent/descriptor/test_dpa3.py b/source/tests/consistent/descriptor/test_dpa3.py
index 2647da52b3..b0ad515619 100644
--- a/source/tests/consistent/descriptor/test_dpa3.py
+++ b/source/tests/consistent/descriptor/test_dpa3.py
@@ -60,15 +60,16 @@
 @parameterized(
     ("const",),  # update_residual_init
     ([], [[0, 1]]),  # exclude_types
-    (True, False),  # update_angle
+    (True,),  # update_angle
     (0, 1),  # a_compress_rate
     (1, 2),  # a_compress_e_rate
     (True,),  # a_compress_use_split
     (True, False),  # optim_update
     (True, False),  # use_exp_switch
     (True, False),  # use_dynamic_sel
+    (True, False),  # use_loc_mapping
     (0.3, 0.0),  # fix_stat_std
-    (1, 2),  # n_multi_edge_message
+    (1,),  # n_multi_edge_message
     ("float64",),  # precision
 )
 class TestDPA3(CommonTest, DescriptorTest, unittest.TestCase):
@@ -84,6 +85,7 @@ def data(self) -> dict:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -124,6 +126,7 @@ def data(self) -> dict:
             "precision": precision,
             "exclude_types": exclude_types,
             "env_protection": 0.0,
+            "use_loc_mapping": use_loc_mapping,
             "trainable": True,
         }
 
@@ -139,6 +142,7 @@ def skip_pt(self) -> bool:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -157,6 +161,7 @@ def skip_pd(self) -> bool:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -166,6 +171,7 @@ def skip_pd(self) -> bool:
             or precision == "bfloat16"
             or use_exp_switch
             or use_dynamic_sel
+            or use_loc_mapping
         )  # not supported yet
 
     @property
@@ -180,6 +186,7 @@ def skip_dp(self) -> bool:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -198,6 +205,7 @@ def skip_tf(self) -> bool:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -258,6 +266,7 @@ def setUp(self) -> None:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -339,6 +348,7 @@ def rtol(self) -> float:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
@@ -363,6 +373,7 @@ def atol(self) -> float:
             optim_update,
             use_exp_switch,
             use_dynamic_sel,
+            use_loc_mapping,
             fix_stat_std,
             n_multi_edge_message,
             precision,
diff --git a/source/tests/pd/model/test_dpa3.py b/source/tests/pd/model/test_dpa3.py
index 4125e51ff0..f3aeedeecb 100644
--- a/source/tests/pd/model/test_dpa3.py
+++ b/source/tests/pd/model/test_dpa3.py
@@ -70,7 +70,10 @@ def test_consistency(
             rtol, atol = get_tols(prec)
             if prec == "float64":
                 atol = 1e-8  # marginal GPU test cases...
-
+            coord_ext = np.concatenate([self.coord_ext[:1], self.coord_ext[:1]], axis=0)
+            atype_ext = np.concatenate([self.atype_ext[:1], self.atype_ext[:1]], axis=0)
+            nlist = np.concatenate([self.nlist[:1], self.nlist[:1]], axis=0)
+            mapping = np.concatenate([self.mapping[:1], self.mapping[:1]], axis=0)
             repflow = RepFlowArgs(
                 n_dim=20,
                 e_dim=10,
@@ -108,18 +111,18 @@ def test_consistency(
             dd0.repflows.mean = paddle.to_tensor(davg, dtype=dtype, place=env.DEVICE)
             dd0.repflows.stddev = paddle.to_tensor(dstd, dtype=dtype, place=env.DEVICE)
             rd0, _, _, _, _ = dd0(
-                paddle.to_tensor(self.coord_ext, dtype=dtype, place=env.DEVICE),
-                paddle.to_tensor(self.atype_ext, dtype=paddle.int64, place=env.DEVICE),
-                paddle.to_tensor(self.nlist, dtype=paddle.int64, place=env.DEVICE),
-                paddle.to_tensor(self.mapping, dtype=paddle.int64, place=env.DEVICE),
+                paddle.to_tensor(coord_ext, dtype=dtype, place=env.DEVICE),
+                paddle.to_tensor(atype_ext, dtype=paddle.int64, place=env.DEVICE),
+                paddle.to_tensor(nlist, dtype=paddle.int64, place=env.DEVICE),
+                paddle.to_tensor(mapping, dtype=paddle.int64, place=env.DEVICE),
             )
             # serialization
             dd1 = DescrptDPA3.deserialize(dd0.serialize())
             rd1, _, _, _, _ = dd1(
-                paddle.to_tensor(self.coord_ext, dtype=dtype, place=env.DEVICE),
-                paddle.to_tensor(self.atype_ext, dtype=paddle.int64, place=env.DEVICE),
-                paddle.to_tensor(self.nlist, dtype=paddle.int64, place=env.DEVICE),
-                paddle.to_tensor(self.mapping, dtype=paddle.int64, place=env.DEVICE),
+                paddle.to_tensor(coord_ext, dtype=dtype, place=env.DEVICE),
+                paddle.to_tensor(atype_ext, dtype=paddle.int64, place=env.DEVICE),
+                paddle.to_tensor(nlist, dtype=paddle.int64, place=env.DEVICE),
+                paddle.to_tensor(mapping, dtype=paddle.int64, place=env.DEVICE),
             )
             np.testing.assert_allclose(
                 rd0.numpy(),
@@ -129,9 +132,7 @@ def test_consistency(
             )
             # dp impl
             dd2 = DPDescrptDPA3.deserialize(dd0.serialize())
-            rd2, _, _, _, _ = dd2.call(
-                self.coord_ext, self.atype_ext, self.nlist, self.mapping
-            )
+            rd2, _, _, _, _ = dd2.call(coord_ext, atype_ext, nlist, mapping)
             np.testing.assert_allclose(
                 rd0.numpy(),
                 rd2,
diff --git a/source/tests/pt/model/test_loc_mapping.py b/source/tests/pt/model/test_loc_mapping.py
new file mode 100644
index 0000000000..81b49da0ed
--- /dev/null
+++ b/source/tests/pt/model/test_loc_mapping.py
@@ -0,0 +1,179 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import itertools
+import unittest
+
+import numpy as np
+import torch
+
+from deepmd.dpmodel.descriptor.dpa3 import (
+    RepFlowArgs,
+)
+from deepmd.pt.model.descriptor import (
+    DescrptDPA3,
+)
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt.utils.env import (
+    PRECISION_DICT,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
+from .test_env_mat import (
+    TestCaseSingleFrameWithNlist,
+)
+from .test_mlp import (
+    get_tols,
+)
+
+dtype = env.GLOBAL_PT_FLOAT_PRECISION
+
+
+class TestDescrptDPA3LocMapping(unittest.TestCase, TestCaseSingleFrameWithNlist):
+    """Check that the DPA3 descriptor does not depend on ``use_loc_mapping``.
+
+    Two descriptors are built from the same ``RepFlowArgs`` and seed, one with
+    ``use_loc_mapping=False`` and one with ``use_loc_mapping=True``, and are
+    evaluated on identical inputs; their outputs must agree within tolerance.
+    """
+
+    def setUp(self) -> None:
+        TestCaseSingleFrameWithNlist.setUp(self)
+
+    def _eval_descriptor(
+        self, repflow, prec, ect, stats, inputs, *, use_loc_mapping
+    ) -> np.ndarray:
+        """Build a DPA3 descriptor, evaluate it and return the descriptor array.
+
+        Parameters
+        ----------
+        repflow : RepFlowArgs
+            Repflow settings shared by the descriptors under comparison.
+        prec : str
+            Precision name, used as key into ``PRECISION_DICT``.
+        ect : bool
+            Value passed as ``use_econf_tebd``.
+        stats : tuple of np.ndarray
+            ``(davg, dstd)`` assigned to ``repflows.mean`` and ``repflows.stddev``.
+        inputs : tuple of np.ndarray
+            ``(coord_ext, atype_ext, nlist, mapping)`` passed to the descriptor.
+        use_loc_mapping : bool
+            Value passed as ``use_loc_mapping``.
+
+        Returns
+        -------
+        np.ndarray
+            First output of the descriptor, detached and moved to the CPU.
+        """
+        tdtype = PRECISION_DICT[prec]
+        davg, dstd = stats
+        coord_ext, atype_ext, nlist, mapping = inputs
+        dd = DescrptDPA3(
+            self.nt,
+            repflow=repflow,
+            # kwargs for descriptor
+            exclude_types=[],
+            precision=prec,
+            use_econf_tebd=ect,
+            type_map=["O", "H"] if ect else None,
+            seed=GLOBAL_SEED,
+            use_loc_mapping=use_loc_mapping,
+        ).to(env.DEVICE)
+        dd.repflows.mean = torch.tensor(davg, dtype=tdtype, device=env.DEVICE)
+        dd.repflows.stddev = torch.tensor(dstd, dtype=tdtype, device=env.DEVICE)
+        rd, _, _, _, _ = dd(
+            torch.tensor(coord_ext, dtype=tdtype, device=env.DEVICE),
+            torch.tensor(atype_ext, dtype=int, device=env.DEVICE),
+            torch.tensor(nlist, dtype=int, device=env.DEVICE),
+            torch.tensor(mapping, dtype=int, device=env.DEVICE),
+        )
+        return rd.detach().cpu().numpy()
+
+    def _check_consistency(self, **repflow_extra) -> None:
+        """Compare both descriptor variants over a grid of repflow options.
+
+        Parameters
+        ----------
+        **repflow_extra
+            Keyword arguments forwarded to ``RepFlowArgs`` in addition to the
+            common settings, e.g. ``use_dynamic_sel``.
+        """
+        rng = np.random.default_rng(100)
+        _, _, nnei = self.nlist.shape
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(rng.normal(size=(self.nt, nnei, 4)))
+        # loop-invariant inputs: the first frame duplicated into two frames
+        inputs = tuple(
+            np.concatenate([arr[:1], arr[:1]], axis=0)
+            for arr in (self.coord_ext, self.atype_ext, self.nlist, self.mapping)
+        )
+
+        for ua, rus, ruri, acr, nme, prec, ect, optim in itertools.product(
+            [True, False],  # update_angle
+            ["res_residual"],  # update_style
+            ["norm", "const"],  # update_residual_init
+            [0, 1],  # a_compress_rate
+            [1, 2],  # n_multi_edge_message
+            ["float64"],  # precision
+            [False],  # use_econf_tebd
+            [True, False],  # optim_update
+        ):
+            # subTest labels a failure with the parameter combination that broke
+            with self.subTest(
+                update_angle=ua,
+                update_style=rus,
+                update_residual_init=ruri,
+                a_compress_rate=acr,
+                n_multi_edge_message=nme,
+                precision=prec,
+                use_econf_tebd=ect,
+                optim_update=optim,
+            ):
+                rtol, atol = get_tols(prec)
+                if prec == "float64":
+                    atol = 1e-8  # marginal GPU test cases...
+
+                repflow = RepFlowArgs(
+                    n_dim=20,
+                    e_dim=10,
+                    a_dim=10,
+                    nlayers=3,
+                    e_rcut=self.rcut,
+                    e_rcut_smth=self.rcut_smth,
+                    e_sel=nnei,
+                    a_rcut=self.rcut - 0.1,
+                    a_rcut_smth=self.rcut_smth,
+                    a_sel=nnei,
+                    a_compress_rate=acr,
+                    n_multi_edge_message=nme,
+                    axis_neuron=4,
+                    update_angle=ua,
+                    update_style=rus,
+                    update_residual_init=ruri,
+                    optim_update=optim,
+                    smooth_edge_update=True,
+                    **repflow_extra,
+                )
+                # dpa3 without local mapping
+                rd0 = self._eval_descriptor(
+                    repflow, prec, ect, (davg, dstd), inputs, use_loc_mapping=False
+                )
+                # dpa3 using local mapping
+                rd1 = self._eval_descriptor(
+                    repflow, prec, ect, (davg, dstd), inputs, use_loc_mapping=True
+                )
+                np.testing.assert_allclose(rd0, rd1, rtol=rtol, atol=atol)
+
+    def test_consistency(
+        self,
+    ) -> None:
+        """Check consistency with the common repflow settings only."""
+        self._check_consistency()
+
+    def test_consistency_nosel(
+        self,
+    ) -> None:
+        """Check consistency with ``use_dynamic_sel=True``."""
+        self._check_consistency(use_dynamic_sel=True, sel_reduce_factor=10.0)
diff --git a/source/tests/universal/common/cases/model/utils.py b/source/tests/universal/common/cases/model/utils.py
index 5a4c64c803..08a369933d 100644
--- a/source/tests/universal/common/cases/model/utils.py
+++ b/source/tests/universal/common/cases/model/utils.py
@@ -202,7 +202,10 @@ def test_forward(self) -> None:
                         assert rr is None
                     else:
                         np.testing.assert_allclose(
-                            subret[0], rr, err_msg=f"compare {kk} between 0 and {ii}"
+                            subret[0],
+                            rr,
+                            err_msg=f"compare {kk} between 0 and {ii}",
+                            atol=aprec,
                         )
         for kk in ret_lower[0].keys():
             subret = []
@@ -215,7 +218,10 @@ def test_forward(self) -> None:
                         assert rr is None
                     else:
                         np.testing.assert_allclose(
-                            subret[0], rr, err_msg=f"compare {kk} between 0 and {ii}"
+                            subret[0],
+                            rr,
+                            err_msg=f"compare {kk} between 0 and {ii}",
+                            atol=aprec,
                         )
         same_keys = set(ret[0].keys()) & set(ret_lower[0].keys())
         self.assertTrue(same_keys)
@@ -305,7 +311,10 @@ def test_zero_forward(self) -> None:
                         assert rr is None
                     else:
                         np.testing.assert_allclose(
-                            subret[0], rr, err_msg=f"compare {kk} between 0 and {ii}"
+                            subret[0],
+                            rr,
+                            err_msg=f"compare {kk} between 0 and {ii}",
+                            atol=aprec,
                         )
         for kk in ret_lower[0]:
             subret = []
@@ -318,7 +327,10 @@ def test_zero_forward(self) -> None:
                         assert rr is None
                     else:
                         np.testing.assert_allclose(
-                            subret[0], rr, err_msg=f"compare {kk} between 0 and {ii}"
+                            subret[0],
+                            rr,
+                            err_msg=f"compare {kk} between 0 and {ii}",
+                            atol=aprec,
                         )
         same_keys = set(ret[0].keys()) & set(ret_lower[0].keys())
         self.assertTrue(same_keys)
diff --git a/source/tests/universal/dpmodel/descriptor/test_descriptor.py b/source/tests/universal/dpmodel/descriptor/test_descriptor.py
index 08708c5924..cafd405285 100644
--- a/source/tests/universal/dpmodel/descriptor/test_descriptor.py
+++ b/source/tests/universal/dpmodel/descriptor/test_descriptor.py
@@ -486,6 +486,7 @@ def DescriptorParamDPA3(
     fix_stat_std=0.3,
     use_dynamic_sel=False,
     precision="float64",
+    use_loc_mapping=True,
 ):
     input_dict = {
         # kwargs for repformer
@@ -531,6 +532,7 @@ def DescriptorParamDPA3(
         "trainable": True,
         "use_econf_tebd": False,
         "use_tebd_bias": False,
+        "use_loc_mapping": use_loc_mapping,
         "type_map": type_map,
         "seed": GLOBAL_SEED,
     }
@@ -555,6 +557,7 @@ def DescriptorParamDPA3(
             "use_dynamic_sel": (True, False),
             "env_protection": (0.0, 1e-8),
             "precision": ("float64",),
+            "use_loc_mapping": (True, False),
         }
     ),
 )