22 KiB
xz_cable_setup_check_tool.py build
pyinstaller --onefile \
--paths=/home/gyou/NexusBench-baihe-br/nexusbench \
--paths=/home/gyou/NexusBench-baihe-br/nexusbench/log \
--paths=/home/gyou/NexusBench-baihe-br/nexusbench/connection \
--paths=/home/gyou/NexusBench-baihe-br/nexusbench/gpu/biren \
--hidden-import=connection.http_helper \
--runtime-tmpdir=/home/gyou/tmp \
/home/gyou/NexusBench-baihe-br/nexusbench/biren_test.py
# Adding this line made it run (加了这一行就可以运行了)
--runtime-tmpdir=/home/gyou/tmp \
--hidden-import=log.logger \
--hidden-import=connection.ssh_connection_manager \
./build/whiteriver_exp --host 10.57.216.109 --exp 2 --cmd vcmd --param rev
./vuart -i 10.57.216.109 -e 2 -c vcmd -p ver
./vuart -i 10.57.216.108 -e 4 -c fw-down -p
./build/whiteriver_exp -i 10.57.216.108 -e 4 -c fw-down -p
./build/whiteriver_exp -i 10.57.216.91 -e 4 -c vcmd -p ver
net use X: \\10.57.216.173\shared /user:root ossdbg1
PicT1!2@3#4$
RCms@Zte3
./build/whiteriver_exp -i 10.57.216.94 -e 4 -c fw-down -p "./whiteriver_exp@1.0.17+2508161844.img"
./build/whiteriver_exp -i 10.57.216.95,10.57.216.96,10.57.216.97,10.57.216.98
scp /usr/bin/ocs_link_reset root@10.57.216.166:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.166:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.165:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.187:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.148:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.163:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.139:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.173:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.167:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.134:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.145:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.176:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.180:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.185:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.150:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.168:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.174:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.132:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.189:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.151:/usr/bin/
scp -o "BatchMode=no" -o "StrictHostKeyChecking=no" -i pass.txt /usr/bin/ocs_link_reset root@10.57.216.156:/usr/bin/
def read_tia_stage2_reg(self, exp_id: int, slot_id: int,
                        lane: int) -> int:
    """
    Read the 'tia_stage2' register of one lane.

    Thin wrapper that forwards to ``self.read_opt_reg`` with the register
    name ``'tia_stage2'`` and returns whatever that call returns.

    Args:
        exp_id: Identifier passed through as the first argument.
        slot_id: Identifier passed through as the second argument.
        lane: Lane passed through as the third argument.
    """
    return self.read_opt_reg(exp_id, slot_id, lane, 'tia_stage2')
def write_tia_stage2_reg(self, exp_id: int, slot_id: int,
                         lane: int, wt_value: int) -> bool:
    """
    Write ``wt_value`` to the 'tia_stage2' register of one lane.

    Thin wrapper that forwards to ``self.write_opt_reg`` with the register
    name ``'tia_stage2'`` and returns whatever that call returns.

    Args:
        exp_id: Identifier passed through as the first argument.
        slot_id: Identifier passed through as the second argument.
        lane: Lane passed through as the third argument.
        wt_value: Value to write.
    """
    return self.write_opt_reg(exp_id, slot_id, lane, wt_value, 'tia_stage2')
def write_confirm_reg(self, exp_id: int, slot_id: int) -> bool:
    """
    Issue the "confirm" register write for one slot through the BMC.

    Calls ``self.bmc.SetOpticalModuleRegs`` with the fixed arguments
    ``0, 0xd0, 0x88, 1, "01"`` and returns its result unchanged.

    NOTE(review): the meaning of the fixed arguments (presumably
    bank/page/offset/length/data) is not visible here - confirm against the
    BMC API before changing them.
    """
    return self.bmc.SetOpticalModuleRegs(exp_id, slot_id, 0, 0xd0, 0x88, 1, "01")
def read_vpeak_reg(self, exp_id: int, slot_id: int, lane: int) -> int:
    """
    Read the 'vpeak' register of one lane.

    Thin wrapper that forwards to ``self.read_opt_reg`` with the register
    name ``'vpeak'`` and returns whatever that call returns.
    """
    return self.read_opt_reg(exp_id, slot_id, lane, 'vpeak')
def read_opt_reg(self, exp_id: int, slot_id: int,
lane: int, reg_name: str):
def calc_target_vpeak_new(self, exp_id: int, slot_id: int, lane_list: List[int]) -> Dict[int, int]:
    """
    Compute and persist a target Vpeak for every lane of one slot.

    Sequence:
      1. Disable AGC (switch to manual gain control).
      2. Switch RF off and read the baseline Vpeak of every lane.
      3. Switch RF back on, write MGC 255 to all lanes and read Vpeak again.
      4. For each lane:
         ``target = baseline + round((max - baseline) * 16 / 29)``.

    Non-empty results are stored via ``save_target_vpeaks_to_json``.

    Args:
        exp_id: Identifier forwarded to every helper call.
        slot_id: Slot whose lanes are calibrated.
        lane_list: Lanes to process.

    Returns:
        Mapping of lane -> target Vpeak. Empty when RF could not be switched
        off or back on. Lanes missing from the second Vpeak read are skipped.
    """
    # Scaling of the Vpeak swing is numerator / 29.
    # NOTE(review): the origin of the 16/29 ratio is not visible here - confirm.
    numerator = 16
    target_vpeaks: Dict[int, int] = {}
    logging.info(f"-------slot {slot_id}")

    # Switch to MGC.
    logging.info("----------step 1: disable agc")
    self.disable_agc(exp_id, slot_id)

    # Switch RF off.
    logging.info("----------step 2: toogle RF off")
    if not self.toogle_rf(exp_id, slot_id, lane_list, "off"):
        logging.error(f"slot {slot_id}: toogle RF off fail")
        return target_vpeaks

    # Read the baseline Vpeak values.
    logging.info("----------step 3: get base_line_vpeaks, and calculate target vpeaks")
    base_line_vpeaks = self._read_vpeak_all_lanes(exp_id, slot_id, lane_list)
    logging.info(f"base_line_vpeaks: {base_line_vpeaks}")

    # Switch RF back on. Without RF the second read would not measure the
    # signal, so a failure here aborts the same way the RF-off failure does.
    logging.info("----------step 4: enable RF")
    if not self.toogle_rf(exp_id, slot_id, lane_list, "on"):
        logging.error(f"slot {slot_id}: toogle RF on fail")
        return target_vpeaks

    self._write_mgc_all_lanes(exp_id, slot_id, lane_list, 255)
    time.sleep(0.05)  # important (presumably lets the MGC write settle - confirm)
    max_vpeaks = self._read_vpeak_all_lanes(exp_id, slot_id, lane_list)

    logging.info(f'-------numerator:{numerator}')
    for lane, base_vpeak in base_line_vpeaks.items():
        if lane not in max_vpeaks:
            # The second read did not return this lane; skip it instead of
            # raising KeyError and losing the other lanes' results.
            logging.error(f"slot {slot_id}: lane {lane} missing from max vpeak read, skipped")
            continue
        vpkdelta = round((max_vpeaks[lane] - base_vpeak) * (numerator / 29))
        target_vpeaks[lane] = vpkdelta + base_vpeak

    logging.info(f"target_vpeaks: {target_vpeaks}")
    if target_vpeaks:
        self.save_target_vpeaks_to_json(exp_id, slot_id, target_vpeaks)
    return target_vpeaks
def match_optimal_mgc_new(self, exp_id: int, slot_id: int, lane_list: List[int],
                          target_vpeaks: Dict[int, int],
                          max_iterations: int = 256,
                          settle_delay: float = 0.05) -> bool:
    """
    Search, lane by lane, for an MGC value whose Vpeak hits the target.

    For each lane the search starts at the midpoint of the MGC register's
    valid range. Each iteration writes the MGC value, writes the confirm
    register, reads Vpeak and compares it with the accepted window
    ``[target, target + 1]``. Outside the window, MGC is moved by
    ``step + extra`` in the needed direction, where ``extra`` grows with the
    distance from the target.

    The search for a lane stops as unmatched when the next MGC value would
    leave the register's valid range (it is clamped, and no progress is
    possible) or when ``max_iterations`` attempts are used up. Matched lanes
    are stored via ``save_mgc_results_to_json``.

    Args:
        exp_id: Identifier forwarded to the register tool.
        slot_id: Slot whose lanes are tuned.
        lane_list: Logical lanes to tune.
        target_vpeaks: Mapping of lane -> target Vpeak.
        max_iterations: Upper bound on write/read attempts per lane.
        settle_delay: Seconds to wait after each register write.

    Returns:
        True when every lane matched; False on a register lookup/write
        failure or when at least one lane stayed unmatched.
    """
    # Extra MGC step added to the register step, keyed by |target - current|.
    # Distances not listed use 5.
    extra_step_by_diff = {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 3, 6: 3, 7: 4, 8: 5}
    matched_results = {}
    unmatched_lanes = []

    for lane_id in lane_list:
        reg = self.reg_table.get_register_by_logic_lane('mgc', lane_id)
        if reg is None or reg.valid_range is None:
            logging.error(f'match_optimal_mgc_new error. exp:{exp_id}, slot:{slot_id}, lane: {lane_id}, register name: mgc')
            return False
        if lane_id not in target_vpeaks:
            # No target was computed for this lane; report it as unmatched
            # instead of raising KeyError and abandoning the other lanes.
            logging.error(f'match_optimal_mgc_new error. exp:{exp_id}, slot:{slot_id}, lane: {lane_id}, no target vpeak')
            unmatched_lanes.append(lane_id)
            continue

        min_val = reg.valid_range[0]
        max_val = reg.valid_range[1]
        step = reg.step if reg.step is not None else 1
        wt_mgc = int((min_val + max_val) / 2)
        target_vpeak = target_vpeaks[lane_id]
        # Accepted window, e.g. target 100 -> 100..101.
        low, high = target_vpeak, target_vpeak + 1

        is_match = False
        matched_mgc = None
        matched_vpeak = None
        for _ in range(max_iterations):
            ret = self.reg_tool.write_mgc_reg(exp_id, slot_id, lane_id, wt_mgc)
            # `== False` is kept on purpose: the write helpers' actual return
            # type is not visible here, and `not ret` would also reject None.
            if ret == False:  # noqa: E712
                logging.error(f'match_optimal_mgc_new error. exp:{exp_id}, slot:{slot_id}, lane: {lane_id}, register name: mgc')
                return False
            time.sleep(settle_delay)
            ret = self.reg_tool.write_confirm_reg(exp_id, slot_id)
            if ret == False:  # noqa: E712
                logging.error(f'match_optimal_mgc_new error. exp:{exp_id}, slot:{slot_id}, lane: {lane_id}, register name: confirm')
                return False
            time.sleep(settle_delay)

            current_vpeak = self.reg_tool.read_vpeak_reg(exp_id, slot_id, lane_id)
            logging.info(f"exp_id:{exp_id}, slot_id:{slot_id}, lane:{lane_id} -> set mgc {wt_mgc}, Vpeak value: {current_vpeak}, target: {target_vpeak}")

            if low <= current_vpeak <= high:
                is_match = True
                matched_mgc = wt_mgc
                matched_vpeak = current_vpeak
                break

            stride = step + extra_step_by_diff.get(abs(target_vpeak - current_vpeak), 5)
            next_mgc = wt_mgc - stride if current_vpeak > high else wt_mgc + stride
            # Never write a value outside the register's valid range.
            next_mgc = max(min_val, min(max_val, next_mgc))
            if next_mgc == wt_mgc:
                # Pinned at a range boundary (or zero stride): no further
                # progress is possible, so stop instead of looping forever.
                logging.warning(f"exp_id:{exp_id}, slot_id:{slot_id}, lane:{lane_id} -> mgc {wt_mgc} cannot move further, target {target_vpeak} unreachable")
                break
            wt_mgc = next_mgc
        else:
            logging.warning(f"exp_id:{exp_id}, slot_id:{slot_id}, lane:{lane_id} -> no match after {max_iterations} attempts")

        if is_match:
            logging.info(f"------------------ Vpeak is matched, target vpeak:{target_vpeak}, current vpeak: {current_vpeak}, current mgc:{wt_mgc} ")
            per_mode = (matched_results
                        .setdefault(str(exp_id), {})
                        .setdefault(str(slot_id), {})
                        .setdefault(self.mode, {}))
            per_mode[str(reg.lane)] = {
                "mgc": matched_mgc,
                "target_vpeak": target_vpeaks[lane_id],
                "actual_vpeak": matched_vpeak
            }
        else:
            unmatched_lanes.append(lane_id)

    if matched_results:
        self.save_mgc_results_to_json(exp_id, slot_id, matched_results)
    if unmatched_lanes:
        logging.error(f"以下lane未匹配到合适的MGC值: {unmatched_lanes}")
        return False
    return True
# Look up the slot on the far end of the link that contains this eye's
# retimer (eye.rt_index) and lane (eye.lane_index), limited to self.route_name.
# NOTE(review): self.topo_map is presumably a TopoMappingParser, in which case
# the result is (slot_index, slot_lane) or None - confirm against the caller.
remote_slot = self.topo_map.get_remote_slot_by_retimer(eye.rt_index, eye.lane_index, self.route_name)
# Same lookup for the slot on the retimer's own side of the link.
local_slot = self.topo_map.get_local_slot_by_retimer(eye.rt_index, eye.lane_index, self.route_name)
import re
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple, Iterator

import yaml
@dataclass(frozen=True)
class Side:
    """
    Represents one side of a link (e.g., host side or device side),
    explicitly containing GPU, Retimer, and Slot with their indices and lanes.
    """
    gpu_index: int
    gpu_lane: int
    retimer_index: int
    retimer_lane: int
    slot_index: int
    slot_lane: int

    def __str__(self) -> str:
        return (f"GPU{self.gpu_index}_L{self.gpu_lane} <-> "
                f"RTMR{self.retimer_index}_L{self.retimer_lane} <-> "
                f"SLOT{self.slot_index}_L{self.slot_lane}")


@dataclass(frozen=True)
class TopoLink:
    """
    Represents a full physical link between two sides (A and B).
    Example: Side(GPU0, RTMR21, SLOT0) <-> Side(GPU6, RTMR32, SLOT0)
    """
    side_a: Side
    side_b: Side
    route_name: str

    def __str__(self) -> str:
        return f"{self.side_a} <-> {self.side_b}"


@dataclass
class Route:
    """A named route and the links that belong to it."""
    name: str
    links: List[TopoLink]

    def __iter__(self) -> Iterator[TopoLink]:
        return iter(self.links)

    def __len__(self) -> int:
        return len(self.links)


class TopoMappingParser:
    """
    Parses topology YAML file into structured links with explicit Side components.
    Provides utility methods to query relationships between Retimer, Slot, GPU.
    All query methods support optional filtering by route_name.
    """

    # Regex to match: DEVICE<index>_L<lane>, e.g., GPU0_L0, RTMR21_L7, SLOT5_L3
    _TOKEN_PATTERN = re.compile(r"([A-Z]+)(\d+)_L(\d+)")

    def __init__(self, yaml_file: str):
        self.yaml_file = yaml_file
        self.routes: List[Route] = []
        self._all_links: List[TopoLink] = []  # Flat index for fast lookup

    def parse(self) -> 'TopoMappingParser':
        """
        Parse the YAML file and build structured links.

        The document is iterated as a sequence of entries; entries that are
        not mappings or lack 'route_name' / 'links' are skipped. An empty
        document or a null 'links' value yields no links. Existing state is
        replaced only after the whole file parsed successfully.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if any link string cannot be parsed.
        """
        with open(self.yaml_file, 'r') as f:
            data = yaml.safe_load(f)

        routes: List[Route] = []
        all_links: List[TopoLink] = []
        # safe_load returns None for an empty document.
        for item in data or []:
            if not isinstance(item, dict) or 'route_name' not in item or 'links' not in item:
                continue
            route_name = item['route_name']
            links = []
            for link_str in item['links'] or []:
                try:
                    link = self._parse_link(link_str.strip(), route_name)
                except Exception as e:
                    raise ValueError(
                        f"Failed to parse link '{link_str}' in route '{route_name}': {e}"
                    ) from e
                links.append(link)
                all_links.append(link)
            routes.append(Route(route_name, links))

        # Replace contents in place so existing references to the lists stay valid.
        self.routes[:] = routes
        self._all_links[:] = all_links
        return self

    def _parse_link(self, link_str: str, route_name: str) -> TopoLink:
        """
        Parse a link string into two structured Sides (A and B).
        Assumes format: GPUx_La <-> RTMRy_Lb <-> SLOTz_Lc <-> ... <-> GPUx_La
        And that both ends have: GPU, RTMR, SLOT in order.
        Middle may have repeated SLOT/RTMR.

        Raises:
            ValueError: on fewer than 6 tokens, a malformed token, fewer than
                two retimers, or a side missing GPU/RTMR/SLOT.
        """
        tokens = [t.strip() for t in link_str.split('<->')]
        if len(tokens) < 6:
            raise ValueError(f"Link too short to extract both sides: {link_str}")

        # Parse all nodes
        nodes = [self._parse_token(tok) for tok in tokens]

        retimer_count = sum(1 for dev_type, _, _ in nodes if dev_type == "RTMR")
        if retimer_count < 2:
            raise ValueError(f"Link must have at least two retimers: {link_str}")

        # Split at the middle node; Side B is scanned from the far end inward,
        # so each side picks the components nearest to its own GPU.
        mid = len(nodes) // 2
        side_a = self._extract_side(nodes[:mid + 1])
        side_b = self._extract_side(list(reversed(nodes[mid:])))
        return TopoLink(side_a=side_a, side_b=side_b, route_name=route_name)

    def _parse_token(self, token: str) -> Tuple[str, int, int]:
        """
        Parse token like 'GPU0_L0' into (type, index, lane)
        """
        match = self._TOKEN_PATTERN.fullmatch(token)
        if not match:
            raise ValueError(f"Invalid token format: '{token}'")
        dev_type, idx, lane = match.groups()
        return dev_type, int(idx), int(lane)

    def _extract_side(self, nodes: List[Tuple[str, int, int]]) -> Side:
        """
        Extract GPU, RTMR, SLOT from a list of nodes (assumed to be one side).
        Picks the first occurrence of each.
        """
        gpu = retimer = slot = None
        for dev_type, idx, lane in nodes:
            if dev_type == "GPU" and gpu is None:
                gpu = (idx, lane)
            elif dev_type == "RTMR" and retimer is None:
                retimer = (idx, lane)
            elif dev_type == "SLOT" and slot is None:
                slot = (idx, lane)
        if gpu is None or retimer is None or slot is None:
            raise ValueError(f"Missing components in side: GPU={gpu}, RTMR={retimer}, SLOT={slot}")
        return Side(
            gpu_index=gpu[0], gpu_lane=gpu[1],
            retimer_index=retimer[0], retimer_lane=retimer[1],
            slot_index=slot[0], slot_lane=slot[1]
        )

    # --------------------------------------------------
    # UTILITY METHODS (support route_name filtering)
    # --------------------------------------------------

    @staticmethod
    def _has_retimer(side: Side, rtmr_idx: int, rtmr_lane: int) -> bool:
        """Return True when ``side`` uses the given retimer index and lane."""
        return side.retimer_index == rtmr_idx and side.retimer_lane == rtmr_lane

    @staticmethod
    def _has_slot(side: Side, slot_idx: int, slot_lane: int) -> bool:
        """Return True when ``side`` uses the given slot index and lane."""
        return side.slot_index == slot_idx and side.slot_lane == slot_lane

    def _filter_links(self, links: List[TopoLink], route_name: Optional[str]) -> List[TopoLink]:
        """Helper to filter links by route_name if provided."""
        if route_name is None:
            return links
        return [link for link in links if link.route_name == route_name]

    def get_links_by_retimer(self, rtmr_idx: int, rtmr_lane: int, route_name: Optional[str] = None) -> List[TopoLink]:
        """Get all links containing the given retimer (index and lane), optionally filtered by route_name."""
        links = [link for link in self._all_links
                 if self._has_retimer(link.side_a, rtmr_idx, rtmr_lane)
                 or self._has_retimer(link.side_b, rtmr_idx, rtmr_lane)]
        return self._filter_links(links, route_name)

    def get_local_side_by_retimer(self, rtmr_idx: int, rtmr_lane: int, route_name: Optional[str] = None) -> Optional[Side]:
        """
        Given a retimer (index, lane), return the local Side (GPU + SLOT on same side),
        optionally filtered by route_name. Returns None when no link matches.
        """
        for link in self.get_links_by_retimer(rtmr_idx, rtmr_lane, route_name):
            if self._has_retimer(link.side_a, rtmr_idx, rtmr_lane):
                return link.side_a
            if self._has_retimer(link.side_b, rtmr_idx, rtmr_lane):
                return link.side_b
        return None

    def get_remote_side_by_retimer(self, rtmr_idx: int, rtmr_lane: int, route_name: Optional[str] = None) -> Optional[Side]:
        """
        Given a retimer (index, lane), return the *remote* Side (the other end),
        optionally filtered by route_name. Returns None when no link matches.
        """
        for link in self.get_links_by_retimer(rtmr_idx, rtmr_lane, route_name):
            if self._has_retimer(link.side_a, rtmr_idx, rtmr_lane):
                return link.side_b
            if self._has_retimer(link.side_b, rtmr_idx, rtmr_lane):
                return link.side_a
        return None

    def get_local_slot_by_retimer(self, rtmr_idx: int, rtmr_lane: int, route_name: Optional[str] = None) -> Optional[Tuple[int, int]]:
        """
        Get (slot_index, slot_lane) on the same side as the given retimer,
        optionally filtered by route_name.
        """
        side = self.get_local_side_by_retimer(rtmr_idx, rtmr_lane, route_name)
        return (side.slot_index, side.slot_lane) if side else None

    def get_remote_slot_by_retimer(self, rtmr_idx: int, rtmr_lane: int, route_name: Optional[str] = None) -> Optional[Tuple[int, int]]:
        """
        Get (slot_index, slot_lane) on the opposite side of the given retimer,
        optionally filtered by route_name.
        """
        side = self.get_remote_side_by_retimer(rtmr_idx, rtmr_lane, route_name)
        return (side.slot_index, side.slot_lane) if side else None

    def get_retimer_by_slot(self, slot_idx: int, slot_lane: int, route_name: Optional[str] = None) -> Optional[Tuple[int, int, int, int]]:
        """
        Given a slot (index, lane), return:
        (retimer_index, retimer_lane, peer_slot_idx, peer_slot_lane)
        Optionally filtered by route_name. Returns None when no link matches.
        """
        for link in self._filter_links(self._all_links, route_name):
            for near, far in ((link.side_a, link.side_b), (link.side_b, link.side_a)):
                if self._has_slot(near, slot_idx, slot_lane):
                    return (
                        near.retimer_index,
                        near.retimer_lane,
                        far.slot_index,
                        far.slot_lane
                    )
        return None

    def get_gpu_by_slot(self, slot_idx: int, slot_lane: int, route_name: Optional[str] = None) -> Optional[Tuple[int, int]]:
        """Get (gpu_index, gpu_lane) connected to the given slot, optionally filtered by route_name."""
        for link in self._filter_links(self._all_links, route_name):
            for side in (link.side_a, link.side_b):
                if self._has_slot(side, slot_idx, slot_lane):
                    return (side.gpu_index, side.gpu_lane)
        return None

    def find_symmetric_links(self, route_name: Optional[str] = None) -> List[TopoLink]:
        """Find links where both sides are identical (e.g., oneta loops), optionally filtered by route_name."""
        links = self._filter_links(self._all_links, route_name)
        # Side is a dataclass, so == compares all six index/lane fields.
        return [link for link in links if link.side_a == link.side_b]
if __name__ == "__main__":
    # Example usage: parse the topology file and run a few sample queries.
    parser = TopoMappingParser("./main_data/topo_mapping.yaml")
    parser.parse()

    # Original calls (across all routes). None is passed explicitly for the
    # remote lookup so the call works whether or not route_name has a default.
    print("All routes - local slot for RTMR21_L0:", parser.get_local_slot_by_retimer(21, 0))
    print("All routes - remote slot for RTMR21_L0:", parser.get_remote_slot_by_retimer(21, 0, None))

    # Query by route_name
    print("onoc6 - local slot for RTMR21_L0:", parser.get_local_slot_by_retimer(21, 0, route_name="onoc6"))
    print("onoc6 - remote slot for RTMR21_L0:", parser.get_remote_slot_by_retimer(21, 0, route_name="onoc6"))
    print("onoc5 - local slot for RTMR21_L0:", parser.get_local_slot_by_retimer(21, 0, route_name="onoc5"))
    print("onoc5 - remote slot for RTMR21_L0:", parser.get_remote_slot_by_retimer(21, 0, route_name="onoc5"))

    # Other queries also support route_name
    print("oneta - symmetric links count:", len(parser.find_symmetric_links(route_name="oneta")))
    print("onoc6 - GPU connected to SLOT0_L0:", parser.get_gpu_by_slot(0, 0, route_name="onoc6"))
    print("onoc5 - GPU connected to SLOT0_L0:", parser.get_gpu_by_slot(0, 0, route_name="onoc5"))