You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

1 lines
227 KiB

to_yd/                                                                                              0000750 0000000 0000000 00000000000 15120512002 010651  5                                                                                                    ustar   root                            root                                                                                                                                                                                                                   to_yd/36_xz_crosstalk_ocs_rssi.sh                                                                   0000750 0000000 0000000 00000032674 15120511737 016204  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# configure
#
# Credentials and defaults.  Each credential may be supplied through the
# environment (ROOT_USER, ROOT_PSSWD, REMOTE_USER, REMOTE_PSSWD) so secrets do
# not have to be edited into this file; the built-in values remain as
# fallbacks so existing invocations keep working.
# NOTE(review): the credential variables, wait_time and the *_flag / run_all
# switches do not appear to be read anywhere else in this script -- confirm
# before relying on them.
root_user=${ROOT_USER:-root}
root_psswd=${ROOT_PSSWD:-'RCms@Zte3'}
remote_user=${REMOTE_USER:-pict}
remote_psswd=${REMOTE_PSSWD:-'PicT1!2@3#4$'}
wait_time=10
# Exported because collect_onet_cross is executed in a child bash.
export exp_tool="./smbus-tool/build/whiteriver_exp.exe"

reset_gpu_flag=false
set_topo_flag=false
run_all=true

# JSON file that maps a star-node id to its node IPs / BMC IPs.
config_file="node_configs.json"
# Filled in by option parsing and by the config loader further down.
target_star_node=""
node1_ip=""
node1_bmc=""
node2_ip=""
node2_bmc=""
node3_ip=""
node3_bmc=""
node4_ip=""
node4_bmc=""
star_node=""
host1=""
host2=""
test_port=""


# Parse the command line into the option globals.
#   --node NODE_ID      star-node key looked up in the config file
#   --host H1,H2[,...]  comma list; only the first two entries are kept
#                       (host1, host2), further entries are ignored
#   --port PORT         port under test, stored in test_port
# An unknown option, or an option whose value is missing or itself starts
# with "--", prints an error and exits with status 1.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --node)
                if [[ -z "${2:-}" || "$2" == --* ]]; then
                    echo "错误：--node 后必须指定节点编号（如 --node 21）"
                    exit 1
                fi
                target_star_node="$2"
                shift 2
                ;;
            --host)
                if [[ -z "${2:-}" || "$2" == --* ]]; then
                    echo "错误：--host 后必须指定主机列表（如 --host 102,103）"
                    exit 1
                fi
                IFS=',' read -ra host_numbers <<< "$2"
                (( ${#host_numbers[@]} >= 1 )) && host1="${host_numbers[0]}"
                (( ${#host_numbers[@]} >= 2 )) && host2="${host_numbers[1]}"
                shift 2
                ;;
            --port)
                if [[ -z "${2:-}" || "$2" == --* ]]; then
                    # The original message wrongly asked for a node id here.
                    echo "错误：--port 后必须指定端口编号（如 --port 4）"
                    exit 1
                fi
                test_port="$2"
                shift 2
                ;;
            *)
                echo "错误：未知参数 $1"
                echo "用法: $0 [--node NODE_ID] [--host HOST1,HOST2,...] [--port PORT]"
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
# The original inline loop consumed every positional parameter; keep that.
set --

# Validate dependencies, the --node value and the configuration file.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
# The node id is used in shell arithmetic below, so it must be purely numeric;
# otherwise bash would evaluate arbitrary expressions supplied by the user.
if ! [[ "$target_star_node" =~ ^[0-9]+$ ]]; then
    echo "错误：--node 的值必须为数字（当前为 '$target_star_node'）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

first_node="$target_star_node"
# 10# forces base 10 so a zero-padded id such as 08 is not parsed as octal.
second_node="$((10#$target_star_node + 4))"

# Print field $2 of star node $1 from the JSON config ("null" when absent).
# The key names are passed with --arg rather than spliced into the filter.
_cfg_get() {
    jq -r --arg star "$1" --arg key "$2" '.[$star][$key]' "$config_file"
}

# node1..node4 come from the selected star node, node5..node8 from the
# star node four ids above it.
for _n in 1 2 3 4; do
    printf -v "node${_n}_ip"  '%s' "$(_cfg_get "$first_node" "node${_n}_ip")"
    printf -v "node${_n}_bmc" '%s' "$(_cfg_get "$first_node" "node${_n}_bmc")"
    printf -v "node$((_n + 4))_ip"  '%s' "$(_cfg_get "$second_node" "node${_n}_ip")"
    printf -v "node$((_n + 4))_bmc" '%s' "$(_cfg_get "$second_node" "node${_n}_bmc")"
done
star_node="$target_star_node"

# Comma lists of the BMC addresses of each star node; exported because
# collect_onet_cross reads them in a child bash.
export SNODE1="$node1_bmc,$node2_bmc,$node3_bmc,$node4_bmc"
export SNODE2="$node5_bmc,$node6_bmc,$node7_bmc,$node8_bmc"

# bmc_hostN = fourth dot-separated field of nodeN_bmc.
for _n in 1 2 3 4 5 6 7 8; do
    _bmc_var="node${_n}_bmc"
    printf -v "bmc_host${_n}" '%s' "$(cut -d '.' -f 4 <<< "${!_bmc_var}")"
done
unset _n _bmc_var

# Validate that the configuration is complete.
if [[ "$node1_ip" == "null" || -z "$node1_ip" ]]; then
    echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置"
    exit 1
fi
if [[ "$node5_bmc" == "null" || -z "$node5_bmc" ]]; then
    # Not fatal: a --host run only needs node1_bmc.
    echo "警告：配置文件中未找到 star_node=$second_node 的配置，node5~node8 不可用"
fi

#######################################
# Shift a "nodeN" label by a starting offset.
# Arguments:
#   $1 - label of the form node<digits> (e.g. node1)
#   $2 - start offset, digits only
# Outputs:
#   prints "node<N + offset - 1>" on stdout; error text on stderr
# Returns:
#   0 on success, 1 when either argument is malformed
#######################################
convert_node() {
    local input_node="$1"
    local start_node="$2"
    local node_suffix
    local target_node_num

    if [[ "$input_node" =~ ^node([0-9]+)$ ]]; then
        node_suffix="${BASH_REMATCH[1]}"
    else
        echo "错误：输入 node 格式无效，需为 'node+数字'（如 node1、node2）" >&2
        return 1
    fi
    if ! [[ "$start_node" =~ ^[0-9]+$ ]]; then
        echo "错误：起始偏移量需为正整数" >&2
        return 1
    fi

    # 10# forces decimal: without it "node08" or offset "09" is rejected as
    # an invalid octal literal and "node010" silently means 8.
    target_node_num=$((10#$node_suffix + 10#$start_node - 1))
    echo "node$target_node_num"
    return 0
}

#######################################
# Measure crosstalk on one port between an "active" and a "cross" BMC:
#   1. write 0 to OCS register 0x100082 on OCS 1..8 of the active BMC
#   2. write 0xff to the same register on the cross BMC and on the
#      same-index peers (in the other SNODE list) of both BMCs
#   3. read "cmis mon" from both BMCs via ocsdiag and RSSI via read_rssi.py
#   4. hand the data to collect_test_data_onoc_rssi.py through environment
#      variables, then write 0 back to every BMC touched in step 2
# Register writes are retried up to 6 times while the tool reports "Locked";
# CMIS reads are retried up to 6 times on "AsyncPoll failed"/"lock: Failed".
# Globals (read; must be exported, this function runs in a child bash):
#   exp_tool, SNODE1, SNODE2, debug_mode
# Arguments:
#   $1 active BMC IP   $2 cross BMC IP   $3 port   $4 topology label (LOOP)
# Outputs:
#   progress and tool output on stdout
#######################################
collect_onet_cross() {
    local active_bmc=$1
    local cross_bmc=$2
    local active_port=$3
    local ocs_topo=$4
    local active_host="https://$active_bmc"
    local cross_host="https://$cross_bmc"
    local active_node cross_node
    active_node=$(echo "$active_bmc" | cut -d '.' -f 4)
    cross_node=$(echo "$cross_bmc" | cut -d '.' -f 4)
    # Tolerate an unset debug_mode instead of failing inside `[ ... -eq 1 ]`.
    local debug=${debug_mode:-0}
    local -a dis_tx_list=("$cross_bmc")
    local -a snode1=() snode2=()
    IFS=',' read -ra snode1 <<< "${SNODE1:-}"
    IFS=',' read -ra snode2 <<< "${SNODE2:-}"
    local idx tx_bmc attempt

    # Write <value> ($2) to register 0x100082 on OCS 1..8 of BMC $1.
    # Defined inside the function so `export -f collect_onet_cross` carries
    # it into the child bash; it reads active_port/debug from the caller.
    _ocs_write_all() {
        local bmc=$1 value=$2
        local ocs try output
        local -a cmd
        for ocs in {1..8}; do
            # $exp_tool is intentionally left unquoted so a value that
            # contains arguments keeps working as it did before.
            # shellcheck disable=SC2206
            cmd=($exp_tool --host "https://$bmc" --port "$active_port" --cmd wb-ocs --reg 0x100082 --value "$value" --ocs "$ocs")
            for ((try = 1; try <= 6; try++)); do
                [[ "$debug" == 1 ]] && echo "执行命令：${cmd[*]}"
                output=$("${cmd[@]}" 2>&1)
                echo "$output"
                if ! grep -q "Locked" <<< "$output"; then
                    break
                fi
                sleep 3
            done
        done
    }

    echo -e "\n============= active_port: $active_port ==============="

    # Enable the OCS on the active side.
    _ocs_write_all "$active_bmc" 0

    # Collect the BMCs whose OCS must be disabled: the cross BMC plus the
    # same-index peer, in the other SNODE list, of the active and cross BMC.
    for idx in "${!snode1[@]}"; do
        if [[ "${snode1[idx]}" == "$active_bmc" || "${snode1[idx]}" == "$cross_bmc" ]]; then
            [[ -n "${snode2[idx]:-}" ]] && dis_tx_list+=("${snode2[idx]}")
        fi
    done
    for idx in "${!snode2[@]}"; do
        if [[ "${snode2[idx]}" == "$active_bmc" || "${snode2[idx]}" == "$cross_bmc" ]]; then
            [[ -n "${snode1[idx]:-}" ]] && dis_tx_list+=("${snode1[idx]}")
        fi
    done

    for tx_bmc in "${dis_tx_list[@]}"; do
        _ocs_write_all "$tx_bmc" 0xff
    done

    echo -e "\nsleep 10s, waiting ocs stable..."
    sleep 10

    echo -e "\n----- collect $active_bmc: p$active_port cmis data -----"
    local active_port_cmis=""
    for ((attempt = 1; attempt <= 6; attempt++)); do
        active_port_cmis=$(ocsdiag -i "$active_bmc" -e "$active_port" -c vcmd -p "cmis mon" 2>&1)
        [[ "$debug" == 1 ]] && echo "active_port_cmis $active_host: $active_port_cmis"
        # -E is required for the alternation; without it the pattern is a
        # literal string and the retry never happens.
        if ! grep -qE "AsyncPoll failed|lock: Failed" <<< "$active_port_cmis"; then
            break
        fi
        sleep 3
    done

    echo -e "\n----- collect $cross_bmc: p$active_port cmis data -----"
    local cross_port_cmis=""
    for ((attempt = 1; attempt <= 6; attempt++)); do
        cross_port_cmis=$(ocsdiag -i "$cross_bmc" -e "$active_port" -c vcmd -p "cmis mon" 2>&1)
        [[ "$debug" == 1 ]] && echo "cross_port_cmis $cross_host: $cross_port_cmis"
        if ! grep -qE "AsyncPoll failed|lock: Failed" <<< "$cross_port_cmis"; then
            break
        fi
        sleep 3
    done

    echo -e "\n----- collect p$active_port RSSI data -----"
    sleep 2
    local cross_port_rssi active_port_rssi
    cross_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$cross_host" --port "$active_port" 2>&1)
    active_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$active_host" --port "$active_port" 2>&1)

    # Debug output
    [[ "$debug" == 1 ]] && echo "cross_port_rssi $cross_host: $cross_port_rssi"
    [[ "$debug" == 1 ]] && echo "active_port_rssi $active_host: $active_port_rssi"

    echo -e "\n---- 调用 Python 分析 $active_port 数据 ----"
    # The analysis script receives its inputs through the environment.
    export LOOP="$ocs_topo"
    export ACTIVE_PORT="$active_node-P$active_port"
    export CROSS_PORT="$cross_node-P$active_port"
    export ACTIVE_PORT_RSSI="$active_port_rssi" ACTIVE_PORT_CMIS="$active_port_cmis"
    export CROSS_PORT_RSSI="$cross_port_rssi" CROSS_PORT_CMIS="$cross_port_cmis"

    python3 collect_test_data_onoc_rssi.py

    unset LOOP ACTIVE_BMC ACTIVE_PORT CROSS_BMC CROSS_PORT
    unset ACTIVE_PORT_RSSI ACTIVE_PORT_CMIS CROSS_PORT_RSSI CROSS_PORT_CMIS

    echo -e "\n=============$active_bmc and $cross_bmc: p$active_port 数据处理完成 ===============\n"
    sleep 6

    echo -e "\n============= enable active port: $active_port ==============="
    # Re-enable every OCS that was disabled above.
    for tx_bmc in "${dis_tx_list[@]}"; do
        _ocs_write_all "$tx_bmc" 0
    done

    unset -f _ocs_write_all
}

export -f collect_onet_cross

#######################################
# Run the ONOC and then the ONET crosstalk measurement for one BMC pair.
# For each phase: set the port topology on both BMCs (set_port_topo.sh,
# 30 s timeout each), then run collect_onet_cross in both directions
# (120 s timeout each, in a child bash -- the function must be exported).
# ONOC topology: port 4 or 8 -> onoc6, port 6 -> onoc7.
# ONET topology: onetb when both BMCs fall into the same group of four
# relative to bmc_host1, otherwise oneta; the ONET phase is skipped for port 8.
# Globals:   bmc_host1 (read)
# Arguments: $1 active BMC IP, $2 cross BMC IP, $3 port (4, 6 or 8)
# Returns:   1 when the port is missing or unsupported (nothing is run)
#######################################
link_crosstalk () {
  local active_bmc=$1
  local cross_bmc=$2
  local active_port=$3
  local active_node cross_node ocs_topo port_num snode_a snode_c
  active_node=$(echo "$active_bmc" | cut -d '.' -f 4)
  cross_node=$(echo "$cross_bmc" | cut -d '.' -f 4)

  # Reject a missing / non-numeric / unsupported port up front; previously
  # this fell through with an empty topology and still ran every command.
  if ! [[ "$active_port" =~ ^[0-9]+$ ]]; then
    echo "错误：link_crosstalk 需要数字端口（当前为 '$active_port'）" >&2
    return 1
  fi
  port_num=$((10#$active_port))
  case "$port_num" in
    4|8) ocs_topo="onoc6" ;;
    6) ocs_topo="onoc7" ;;
    *)
      echo "错误：不支持的端口 $active_port（仅支持 4、6、8）" >&2
      return 1
      ;;
  esac

  ###########onoc cross talk#############
  echo "==============test node:$active_node,$cross_node port:$active_port onoc cross talk==========="
  timeout 30s bash set_port_topo.sh "$active_bmc" "$active_port" "$ocs_topo"
  timeout 30s bash set_port_topo.sh "$cross_bmc" "$active_port" "$ocs_topo"
  timeout 120s bash -c 'collect_onet_cross "$@"' _ "$active_bmc" "$cross_bmc" "$active_port" "$ocs_topo"
  timeout 120s bash -c 'collect_onet_cross "$@"' _ "$cross_bmc" "$active_bmc" "$active_port" "$ocs_topo"

  ###########onet cross talk#############
  echo "test node:$active_node,$cross_node port:$active_port onet cross talk"
  # NOTE(review): integer division truncates toward zero, so a BMC whose
  # last octet is below bmc_host1 also lands in group 0 -- confirm that the
  # second star node always uses higher addresses.
  snode_a=$(((active_node - bmc_host1) / 4))
  snode_c=$(((cross_node - bmc_host1) / 4))

  if [ "$snode_a" -eq "$snode_c" ]; then
    echo "the same snode set topo-onetb"
    ocs_topo="onetb"
  else
    echo "not the same snode set topo-oneta"
    ocs_topo="oneta"
  fi

  if [ "$port_num" -ne 8 ]; then
    timeout 30s bash set_port_topo.sh "$active_bmc" "$active_port" "$ocs_topo"
    timeout 30s bash set_port_topo.sh "$cross_bmc" "$active_port" "$ocs_topo"
    timeout 120s bash -c 'collect_onet_cross "$@"' _ "$active_bmc" "$cross_bmc" "$active_port" "$ocs_topo"
    timeout 120s bash -c 'collect_onet_cross "$@"' _ "$cross_bmc" "$active_bmc" "$active_port" "$ocs_topo"
  fi
}
# ---------------------------------------------------------------------------
# Driver: set up logging, then either test the single pair given with
# --host/--port or sweep the built-in list of BMC pairs.
# ---------------------------------------------------------------------------

# A --host run needs an explicit port; fail before any log file is created.
if [ -n "$host1" ] && [ -n "$host2" ] && [ -z "$test_port" ]; then
  echo "错误：使用 --host 时必须同时通过 --port 指定端口（如 --port 4）"
  exit 1
fi

# One timestamp for both files so the log and the CSV names always match.
run_stamp=$(date +%Y%m%d_%H%M%S)
# Braces are required: "$bmc_host1_" / "$bmc_host8_" would expand the
# (unset) variables named bmc_host1_ / bmc_host8_ and drop the host ids.
LOG_FILE="logs/summary_logs/64_xz-crosstalk_ocs_rssi_${bmc_host1}-${bmc_host8}_${run_stamp}.log"
csv_file="csv_data/64_xz-crosstalk_ocs_rssi_${bmc_host1}-${bmc_host8}_${run_stamp}.csv"
export CSV_FILE="$csv_file"
# tee cannot create missing parent directories.
mkdir -p logs/summary_logs csv_data

# Send all output to both the terminal and the log file.
exec > >(tee -a "$LOG_FILE") 2>&1
# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

# Exported so collect_onet_cross sees it in its child bash.
export debug_mode=1

if [ -n "$host1" ] && [ -n "$host2" ]; then
  if [[ "$host1" == *.* ]]; then
    # Full addresses were given.
    node1_bmcip="$host1"
    node2_bmcip="$host2"
  else
    # Only the last octet was given: reuse the network part of node1_bmc.
    node1_bmcip="${node1_bmc%.*}.$host1"
    node2_bmcip="${node1_bmc%.*}.$host2"
  fi

  link_crosstalk "$node1_bmcip" "$node2_bmcip" "$test_port"
else
  echo "------ collect all cross talk data ==="
  # Each entry: <active node index> <cross node index> <port>.
  # Port-8 pairs (8/7, 6/5, 4/3, 2/1) are intentionally not part of the sweep.
  crosstalk_plan=(
    "8 6 4" "8 5 6"
    "7 5 4" "7 6 6"
    "6 4 4" "6 3 6"
    "5 3 4" "5 4 6"
    "4 2 4" "4 1 6"
    "3 1 4" "3 2 6"
    "2 8 4" "2 7 6"
    "1 7 4" "1 8 6"
  )
  for plan_entry in "${crosstalk_plan[@]}"; do
    read -r active_idx cross_idx plan_port <<< "$plan_entry"
    active_var="node${active_idx}_bmc"
    cross_var="node${cross_idx}_bmc"
    link_crosstalk "${!active_var}" "${!cross_var}" "$plan_port"
  done
fi

unset debug_mode exp_tool CSV_FILE
# -f is needed to remove an exported function reliably.
unset -f collect_onet_cross

echo -e "\n所有循环执行完成,数据已保存至:$csv_file"
chmod -R 755 csv_data/
# Record the end time.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                    to_yd/67_xz_BL-16_ltssm_linkup_oneta.sh                                                             0000750 0000000 0000000 00000035553 15120511737 017011  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# configure
#
# Credentials and defaults.  Each credential may be supplied through the
# environment (ROOT_USER, ROOT_PSSWD, REMOTE_USER, REMOTE_PSSWD); the built-in
# values remain as fallbacks so existing invocations keep working.
# NOTE(review): the ssh helper functions further down embed the literal
# passwords in their command lines instead of reading root_psswd /
# remote_psswd, so an override here does not reach them yet.
root_user=${ROOT_USER:-root}
root_psswd=${ROOT_PSSWD:-'RCms@Zte3'}
remote_user=${REMOTE_USER:-pict}
remote_psswd=${REMOTE_PSSWD:-'PicT1!2@3#4$'}
exp_tool="./smbus-tool/build/whiteriver_exp.exe"
wait_time=10
# Number of rounds executed by the main stress loop.
loop_times=100

# Optional per-round steps, switched on with --test.
reset_gpu_flag=false
set_topo_flag=false
run_all=true

# JSON file that maps a star-node id to its node IPs / BMC IPs.
config_file="node_configs.json"
target_star_node=""
node1_ip=""
node1_bmc=""
node2_ip=""
node2_bmc=""
node3_ip=""
node3_bmc=""
node4_ip=""
node4_bmc=""
star_node=""

# Option parsing.
#   --node <id>            star-node key, stored in target_star_node
#   --test <name> [...]    clears run_all and enables the listed steps
#                          (reset_gpu and/or set_topo) until the next "--" option
# Anything else is rejected with an error message and exit status 1.
while [[ $# -gt 0 ]]; do
    if [[ "$1" == "--node" ]]; then
        shift
        if [[ -z "$1" || "$1" =~ ^-- ]]; then
            echo "错误：--node 后必须指定节点编号（如 --node 21）"
            exit 1
        fi
        target_star_node="$1"
        shift
    elif [[ "$1" == "--test" ]]; then
        shift
        run_all=false
        # Consume test names up to the end of the list or the next option.
        until [[ $# -eq 0 || "$1" =~ ^-- ]]; do
            if [[ "$1" == "reset_gpu" ]]; then
                reset_gpu_flag=true
            elif [[ "$1" == "set_topo" ]]; then
                set_topo_flag=true
            else
                echo "错误：不支持的测试参数 '$1'，仅支持 reset_gpu 或 set_topo"
                exit 1
            fi
            shift
        done
    else
        echo "错误：未知参数 '$1'，支持的参数：--node <编号> --test [reset_gpu|set_topo...]"
        exit 1
    fi
done

# Validate dependencies and the configuration file.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

# Print field $1 of the selected star node from the JSON config ("null"
# when absent).  Key names are passed with --arg rather than spliced into
# the jq filter, so an unusual --node value cannot alter the filter.
_cfg_get() {
    jq -r --arg star "$target_star_node" --arg key "$1" '.[$star][$key]' "$config_file"
}

# Load nodeN_ip / nodeN_bmc and derive bmc_hostN (fourth dot-separated
# field of the BMC address) for N = 1..4.
for _n in 1 2 3 4; do
    printf -v "node${_n}_ip"  '%s' "$(_cfg_get "node${_n}_ip")"
    printf -v "node${_n}_bmc" '%s' "$(_cfg_get "node${_n}_bmc")"
    _bmc_var="node${_n}_bmc"
    printf -v "bmc_host${_n}" '%s' "$(cut -d '.' -f 4 <<< "${!_bmc_var}")"
done
unset _n _bmc_var
star_node="$target_star_node"

# Validate that the configuration is complete.
if [[ "$node1_ip" == "null" || -z "$node1_ip" ]]; then
    echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置"
    exit 1
fi


LOG_FILE="logs/summary_logs/stress-4-6-8_links_onetb_recovery_CMIS-mon_$(date +%Y%m%d_%H%M%S).log"
# tee cannot create missing parent directories.
mkdir -p "${LOG_FILE%/*}"
# Send all output to both the terminal and the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

#################################RESETGPU#########################
#######################################
# Reset the GPUs of one node by running "brsmi reset -g" inside the
# mysccl-zds container over ssh, retrying up to three times (20 s apart)
# until eight "GPU<n> Successed." lines are reported.
# NOTE(review): both passwords are embedded in the command line below and
# are therefore visible in the process list -- consider sshpass -e.
# Globals:   remote_user (read)
# Arguments: $1 - node address to ssh to
# Outputs:   progress and the remote output on stderr
# Returns:   0 when all 8 GPUs reported success, 1 after three failed tries
#######################################
reset_gpu() {
    local active_remote1=$1
    local gpu_count=8
    # Locals on purpose: the loop counter used to overwrite the caller's "i".
    local attempt rest_gpu success_count
    echo -e "\n------:RESET $active_remote1 GPU ------" >&2
    for ((attempt = 1; attempt <= 3; attempt++)); do
        rest_gpu=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$active_remote1" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"brsmi reset -g\""' 2>&1)
        echo "reset $active_remote1 GPU:" >&2
        echo "$rest_gpu" >&2
        success_count=$(grep -c "GPU[0-7] Successed." <<< "$rest_gpu")
        if [ "$success_count" -eq "$gpu_count" ]; then
            echo -e "times$attempt: GPU RESET success" >&2
            return 0
        fi
        echo -e "times$attempt: GPU RESET fail" >&2
        # No point in waiting after the last attempt.
        if (( attempt < 3 )); then
            sleep 20
        fi
    done
    return 1
}
############################disable_all_ltssm#########################
#######################################
# Disable LTSSM on all ports of one node by running "./ltssm -d" inside the
# mysccl-zds container over ssh, then verify that eight
# "GPU <n>, HW[<id>]: disable ltssm (2, 4, 6, 8, 10) done" lines came back.
# Globals:   remote_user (read)
# Arguments: $1 - node address to ssh to
# Outputs:   the command, the remote output and the verdict on stdout
# Returns:   0 when all 8 GPUs confirmed, 2 otherwise
#######################################
disable_all_ltssm() {
    local node_ip=$1
    local gpu_count=8
    local dis_gpuport success_count
    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""')

    echo "node-$node_ip disable status:"
    echo "$dis_gpuport"
    # Count the confirmations in the output that was actually captured
    # (the check used to read an undefined variable and so always failed).
    success_count=$(grep -cE '^GPU [0-7], HW\[[0-9]+\]: disable ltssm \(2, 4, 6, 8, 10\) done$' <<< "$dis_gpuport")
    if [ "$success_count" -eq "$gpu_count" ]; then
        echo -e "GPU RESET success"
        return 0
    fi
    echo -e "GPU RESET FAIL"
    return 2
}

#######################################
# Disable LTSSM on a single port of one node by running
# "./scripts/disable_port_ltssm <port>" inside the mysccl-zds container over
# ssh, then verify that eight "HW[<n>]: disable ltssm <port> done" lines
# came back.
# Globals:   remote_user (read)
# Arguments: $1 - node address to ssh to, $2 - port number
# Outputs:   the command, the remote output and the verdict on stdout
# Returns:   0 when all 8 confirmations were seen, 2 otherwise
#######################################
disable_port_ltssm() {
    # Locals keep this helper from overwriting same-named globals.
    local node_ip=$1
    local node_port=$2
    local gpu_count=8
    local remote_cmd dis_gpuport success_count

    remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '"$node_port"'\""'
    # Log the command that is really executed (the port used to be printed
    # as the literal text $node_port).
    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip '$remote_cmd'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" "$remote_cmd")

    echo "node-$node_ip port$node_port disable status:"
    echo "$dis_gpuport"
    # Check that every GPU confirmed the port was disabled.
    success_count=$(grep -c 'HW\[[0-7]\]: disable ltssm '"$node_port"' done$' <<< "$dis_gpuport")
    if [ "$success_count" -eq "$gpu_count" ]; then
        echo -e "disable port ltssm success"
        return 0
    fi
    echo -e "disable port ltssm FAIL"
    return 2
}

#################################RESETEXP#########################
#######################################
# Reset one EXP port with the exp tool and verify the reset by reading the
# seconds part of the "TIM_SW:" value from "--cmd stat": fewer than 5
# seconds counts as a successful reset.
# NOTE(review): the reset parameter is spelled "clod pri"; it may be a typo
# for "cold pri" -- confirm against the tool before changing it.
# Globals:   exp_tool (read); current_success, fail_reason (written on failure)
# Arguments: $1 - host passed to --host, $2 - port passed to --port
# Outputs:   the command, the tool state and the verdict on stdout
# Returns:   0 on success, 2 when TIM_SW is >= 5 s or cannot be parsed
#######################################
reset_exp() {
    local host="$1"
    local port="$2"
    # Locals so a value parsed by an earlier call can never leak into this one.
    local exp_state tim_sw_time sec_part seconds_str seconds

    # $exp_tool is intentionally unquoted so it may carry arguments.
    echo $exp_tool --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    $exp_tool --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    sleep 3
    echo "chek EXP stat..."
    exp_state=$($exp_tool --host "$host" --port "$port" --cmd stat 2>&1)
    echo "exp state:"
    echo "$exp_state"

    # "TIM_SW: h:mm:ss, ..." -> take the text up to the comma, then the
    # last ':'-separated field, then its first run of digits.
    tim_sw_time=$(echo "$exp_state" | grep "TIM_SW:" | sed -n 's/.*TIM_SW: \([^,]*\).*/\1/p')
    sec_part=$(echo "$tim_sw_time" | awk -F':' '{print $NF}')
    seconds_str=$(echo "$sec_part" | tr -d '\n' | grep -oE '[0-9]+' | head -n1)
    if [[ "$seconds_str" =~ ^[0-9]+$ ]]; then
        # 10# avoids treating a zero-padded value such as 08 as octal.
        seconds=$((10#$seconds_str))
    else
        echo "秒数格式无效（提取值: [$seconds_str], 视为FAIL"
        # The message says this counts as a failure, so report it as one
        # (a bare "continue" here is not valid outside a loop).
        current_success=false
        fail_reason="GPU EXP reset FAIL"
        return 2
    fi

    # Evaluate the result.
    if [ "$seconds" -lt 5 ]; then
        echo -e "EXP reset success"
        return 0
    fi
    echo "TIM_SW: $seconds s, EXP reset FAIL"
    current_success=false
    fail_reason="GPU EXP reset FAIL"
    return 2
}

#################################load BL-16 topo#########################
#######################################
# Load the GPU topology on one node by running "./ocsTopo -s mesh_7p.json"
# inside the mysccl-zds container over ssh.
# Globals:   remote_user (read)
# Arguments: $1 - node address to ssh to
# Outputs:   the command and the remote output on stdout
# Returns:   the exit status of the ssh command, so a caller's
#            "if ! load_bl16_topo ..." can detect a failed load (the
#            function used to return 0 unconditionally)
#######################################
load_bl16_topo() {
    local node_ip=$1
    local output rc
    echo -e "\n----load-gpu-topo ----"
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ocsTopo -s mesh_7p.json\""'

    output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ocsTopo -s mesh_7p.json\""' 2>&1)
    rc=$?
    sleep 1
    echo "load topo result:"
    echo "$output"
    return "$rc"
}
#################################LINKUP#########################
#######################################
# Train an ONET link by running "ocsTopo -p <port>" on both end nodes at
# the same time: node 1 in the background, node 2 in the foreground with
# its output captured.  The link counts as up when node 2 reports eight
# "HWID <n> Port[<port>]: Ready" lines.
# Globals:   remote_user (read)
# Arguments: $1 node-1 address, $2 node-1 port, $3 node-2 address, $4 node-2 port
# Outputs:   both commands, node 1's raw output and node 2's captured output
# Returns:   0 when all 8 HWIDs are ready, 2 otherwise (the failure branch
#            used to return 0, which made the caller's check a no-op)
#######################################
onet_linkup() {
    local node1_ip=$1
    local node1_port=$2
    local node2_ip=$3
    local node2_port=$4
    local hwid_count=8
    local remote_cmd1 remote_cmd2 bg_pid link1_output link1_count

    remote_cmd1='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node1_port"'\""'
    remote_cmd2='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node2_port"'\""'

    echo -e "\n----ONET- LINKUP ----"
    # link 1 training
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node1_ip" "$remote_cmd1"
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node2_ip" "$remote_cmd2"

    sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node1_ip" "$remote_cmd1" &
    bg_pid=$!
    # 2>&1 must sit on the command itself; on a separate line it is a no-op
    # and stderr is not captured.
    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node2_ip" "$remote_cmd2" 2>&1)
    # Reap the node-1 side so its output cannot spill into later steps.
    wait "$bg_pid"
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # Check whether every HWID reports the port as ready.
    link1_count=$(grep -c "HWID [0-7] Port\[$node2_port\]: Ready" <<< "$link1_output")
    if [ "$link1_count" -eq "$hwid_count" ]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#######################################
# Train an ONOC link on one node by running "ocsTopo -p <port>" inside the
# mysccl-zds container over ssh.  The link counts as up when eight
# "HWID <n> Port[<port>]: Ready" lines are reported.
# Globals:   remote_user (read)
# Arguments: $1 - node address to ssh to, $2 - port number
# Outputs:   the command and the captured remote output on stdout
# Returns:   0 when all 8 HWIDs are ready, 2 otherwise (the failure branch
#            used to return 0, which made the caller's check a no-op)
#######################################
onoc_linkup() {
    local node_ip=$1
    local node_port=$2
    local hwid_count=8
    local remote_cmd link1_output link1_count

    remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node_port"'\""'

    echo -e "\n----ONOC-LINKUP ----"
    # link 1 training
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" "$remote_cmd"

    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" "$remote_cmd" 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # Check whether every HWID reports the port as ready.
    link1_count=$(grep -c "HWID [0-7] Port\[$node_port\]: Ready" <<< "$link1_output")
    if [ "$link1_count" -eq "$hwid_count" ]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#################################GPUSPEED#########################
#######################################
# Verify that the given port runs at GEN5 x8 on all eight GPUs of a node,
# using "ocsTopo -c" inside the mysccl-zds container over ssh.  Up to three
# attempts are made, 6 s apart.
# Globals:   remote_user (read); current_success, fail_reason (written)
# Arguments: $1 - node address to ssh to, $2 - port number
# Outputs:   the matching "Port <n>" lines and the verdict on stdout
# Returns:   0 when 8 GEN5 x8 lines were seen, 2 after three failed tries
#######################################
gpu_speed() {
    local host=$1
    local port=$2
    local expected_count=8
    local GEN5_pattern="Port $port: GEN: 5, Width: 8"
    local attempt link1_full link1_GEN link1_count

    #----------- check all GPU port is GEN5X8 or not-------------
    echo -e "\n----ONETA- check all GPU port $port is GEN5X8 or not ----"
    for ((attempt = 1; attempt <= 3; attempt++)); do
        # Query the node passed as $1; the command used to target an
        # unrelated global (node_ip) left behind by other helpers.
        link1_full=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$host" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -c\""')
        link1_GEN=$(echo "$link1_full" | grep "Port $port" 2>&1)
        echo -e "link1:\n $link1_GEN"
        link1_count=$(echo "$link1_GEN" | grep -c "$GEN5_pattern" 2>&1)
        echo "link1 gen5 count: $link1_count"
        if [ "$link1_count" -eq "$expected_count" ]; then
            echo -e "RETRAIN LINK 1 GEN5X8 success"
            current_success=true
            return 0
        fi
        echo -e "RETRAIN LINK 1 GEN5X8 FAIL"
        current_success=false
        fail_reason="RETRAIN LINK 1 GEN5X8 FAIL"
        sleep 6
    done
    return 2
}

# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

# The four nodes / BMCs exercised in every round.
nodes=("$node1_ip" "$node2_ip" "$node3_ip" "$node4_ip")
bmcs=("$node1_bmc" "$node2_bmc" "$node3_bmc" "$node4_bmc")
# Host list handed to the smbus-tool python scripts.
bmc_host_list="$bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4"
# Ports trained as ONOC links; port 8 is the ONET link.
ports=(2 4 6)

# Stress loop.  Each round: optional topology setup and GPU reset, disable
# LTSSM on port 8, cold-reset the EXPs, load the topology, screen the OCS,
# bring up the ONET and ONOC links, screen again and report the verdict.
for ((round = 1; round <= loop_times; round++)); do
    echo -e "\n============= 压测linkup 测试第$round 轮 ==============="
    echo
    current_success=true
    fail_reason=""

    ###########set 4G TOPO#############################
    if [ "$set_topo_flag" = true ]; then
        echo -e "\n====== set 4G toop ======"
        echo
        # All 16 port/topology settings run in parallel, then are awaited.
        for bmc in "${bmcs[@]}"; do
            bash set_port_topo.sh "$bmc" 2 onoc5 &
            bash set_port_topo.sh "$bmc" 4 onoc6 &
            bash set_port_topo.sh "$bmc" 6 onoc7 &
            bash set_port_topo.sh "$bmc" 8 oneta &
        done
        wait
        echo
    fi

    #RESET 4G GPU
    if [ "$reset_gpu_flag" = true ]; then
        echo -e "\n====== reset 4G GPU ======"
        echo
        echo bash reset_br_gpu.sh "$node1_ip"
        for node in "${nodes[@]}"; do
            bash reset_br_gpu.sh "$node" &
        done
        wait
        sleep 3
        echo
    fi

    #disable_all_ltssm
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]------ disable ltssm ------"
    for node in "${nodes[@]}"; do
        if ! disable_port_ltssm "$node" 8; then
            current_success=false
            fail_reason+=" GPU disable port fail on node $node; "
        fi
    done

    sleep 1

    #RESETEXP
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----reset exp ----"
    (cd smbus-tool && python3 scripts/exp_cold_reset.py --host "$bmc_host_list" --port 2,4,6,8)
    sleep 1

    #loading BL-16 topo
    for node in "${nodes[@]}"; do
        if ! load_bl16_topo "$node"; then
            current_success=false
            fail_reason+=" GPU load topo fail on node $node; "
        fi
    done

    #OCS-SCREEN
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----ocs_screen before linkup----"
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)

    #onet linkup
    # The failure text names port 8 explicitly: "$port" is not set yet at
    # this point (it only gets a value in the ONOC loop below).
    if ! onet_linkup "$node1_ip" "8" "$node2_ip" "8"; then
        current_success=false
        fail_reason+=" onet port-8 linkup fail ($node1_ip<->$node2_ip); "
    fi
    sleep 2
    if ! onet_linkup "$node3_ip" "8" "$node4_ip" "8"; then
        current_success=false
        fail_reason+=" onet port-8 linkup fail ($node3_ip<->$node4_ip); "
    fi
    sleep 2

    #onoc linkup
    for node in "${nodes[@]}"; do
        for port in "${ports[@]}"; do
            if ! onoc_linkup "$node" "$port"; then
                current_success=false
                fail_reason+=" port-$port onoc linkup fail; "
            fi
        done
    done

    #ocs-screen
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 10s ocs_screen ----"
    sleep 10
    # Screen all four BMCs (the list used to repeat the second host three
    # times, so hosts 3 and 4 were never screened after link-up).
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)
    sleep 6
    #SLEEP 60S ocs-screen
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 60s ocs_screen ----"
    # sleep 60
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)

    ####result###
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]---- 本轮测试结果 ----"
    if [ "$current_success" = true ]; then
        echo -e "\n第$round轮测试成功"
    else
        echo -e "\n第$round轮测试FAIL: $fail_reason"
    fi

done

# Record the end time.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                                                     to_yd/64_xz-onet_swb-crosstalk-a_ocs_rssi.sh                                                        0000750 0000000 0000000 00000024610 15120511737 020146  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# configure
#
# Credentials and defaults.  Each credential may be supplied through the
# environment (ROOT_USER, ROOT_PSSWD, REMOTE_USER, REMOTE_PSSWD) so secrets do
# not have to be edited into this file; the built-in values remain as
# fallbacks so existing invocations keep working.
root_user=${ROOT_USER:-root}
root_psswd=${ROOT_PSSWD:-'RCms@Zte3'}
remote_user=${REMOTE_USER:-pict}
remote_psswd=${REMOTE_PSSWD:-'PicT1!2@3#4$'}
exp_tool="./smbus-tool/build/whiteriver_exp.exe"
wait_time=10
loop_times=100

reset_gpu_flag=false
set_topo_flag=false
run_all=true

# JSON file that maps a star-node id to its node IPs / BMC IPs.
config_file="node_configs.json"
target_star_node=""
node1_ip=""
node1_bmc=""
node2_ip=""
node2_bmc=""
node3_ip=""
node3_bmc=""
node4_ip=""
node4_bmc=""
star_node=""

# Parse the command line.
#   --node <id>   star-node key, stored in target_star_node
# Any other argument is rejected with status 1.  Without the default arm an
# unknown argument was never shifted away and the loop spun forever.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --node)
                if [[ -z "${2:-}" || "$2" == --* ]]; then
                    echo "错误：--node 后必须指定节点编号（如 --node 21）"
                    exit 1
                fi
                target_star_node="$2"
                shift 2
                ;;
            *)
                echo "错误：未知参数 '$1'，支持的参数：--node <编号>"
                exit 1
                ;;
        esac
    done
}

parse_args "$@"
# The original inline loop consumed every positional parameter; keep that.
set --

# Validate dependencies, the --node value and the configuration file.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
# The node id is used in shell arithmetic below, so it must be purely numeric;
# otherwise bash would evaluate arbitrary expressions supplied by the user.
if ! [[ "$target_star_node" =~ ^[0-9]+$ ]]; then
    echo "错误：--node 的值必须为数字（当前为 '$target_star_node'）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

first_node="$target_star_node"
# 10# forces base 10 so a zero-padded id such as 08 is not parsed as octal.
second_node="$((10#$target_star_node + 4))"

# Print field $2 of star node $1 from the JSON config ("null" when absent).
# The key names are passed with --arg rather than spliced into the filter.
_cfg_get() {
    jq -r --arg star "$1" --arg key "$2" '.[$star][$key]' "$config_file"
}

# node1..node4 come from the selected star node, node5..node8 from the
# star node four ids above it.
for _n in 1 2 3 4; do
    printf -v "node${_n}_ip"  '%s' "$(_cfg_get "$first_node" "node${_n}_ip")"
    printf -v "node${_n}_bmc" '%s' "$(_cfg_get "$first_node" "node${_n}_bmc")"
    printf -v "node$((_n + 4))_ip"  '%s' "$(_cfg_get "$second_node" "node${_n}_ip")"
    printf -v "node$((_n + 4))_bmc" '%s' "$(_cfg_get "$second_node" "node${_n}_bmc")"
done
star_node="$target_star_node"

# bmc_hostN = fourth dot-separated field of nodeN_bmc.
for _n in 1 2 3 4 5 6 7 8; do
    _bmc_var="node${_n}_bmc"
    printf -v "bmc_host${_n}" '%s' "$(cut -d '.' -f 4 <<< "${!_bmc_var}")"
done
unset _n _bmc_var


# Validate that the configuration is complete.
if [[ "$node1_ip" == "null" || -z "$node1_ip" ]]; then
    echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置"
    exit 1
fi

#######################################
# Translate a group-relative node label into an absolute node label.
# Arguments: $1 - label of the form node<N>
#            $2 - number made only of digits, used as the offset
# Outputs:   "node<N + $2 - 1>" on stdout; error text on stderr
# Returns:   0 on success, 1 when either argument is malformed
#######################################
convert_node() {
    local input_node="$1" start_node="$2"
    local node_suffix target_node_num

    # Keep only the digits of a well-formed "node<digits>" label.
    node_suffix=$(sed -nE 's/^node([0-9]+)$/\1/p' <<< "$input_node")

    [[ -n "$node_suffix" ]] || {
        echo "错误：输入 node 格式无效，需为 'node+数字'（如 node1、node2）" >&2
        return 1
    }
    [[ "$start_node" =~ ^[0-9]+$ ]] || {
        echo "错误：起始偏移量需为正整数" >&2
        return 1
    }

    target_node_num=$((node_suffix + start_node - 1))
    printf 'node%s\n' "$target_node_num"
    return 0
}


#######################################
# Write one value to OCS register 0x100082 of OCS 1..8 on a port.
# A write is attempted up to three times for as long as the tool output
# contains "Locked", with a 3 s pause between attempts.
# Globals:   exp_tool, debug_mode (read)
# Arguments: $1 - host URL, $2 - port, $3 - register value
# Outputs:   the tool output of every attempt (plus the command line when
#            debug_mode is 1)
#######################################
_onet_write_ocs_reg() {
    local host="$1" port="$2" value="$3"
    local ocs attempt output
    local -a cmd

    for ocs in {1..8}; do
        for ((attempt = 1; attempt <= 3; attempt++)); do
            cmd=("$exp_tool" --host "$host" --port "$port" --cmd wb-ocs
                 --reg 0x100082 --value "$value" --ocs "$ocs")
            if [[ "${debug_mode:-0}" -eq 1 ]]; then
                echo "执行命令：${cmd[*]}"
            fi
            output=$("${cmd[@]}" 2>&1)
            printf '%s\n' "$output"
            # Only a "Locked" answer is retried.
            grep -q "Locked" <<< "$output" || break
            sleep 3
        done
    done
}

#######################################
# Collect crosstalk data for one (active port, cross port) pair.
# Steps: write 0 to the active port (activate) and 0xff to the cross port
# (disable), wait 10 s, read "cmis mon" and RSSI from both ports, pass the
# readings to collect_test_data_onoc_rssi.py through environment variables,
# then write 0 to the cross port again.
# Globals:   exp_tool, debug_mode, star_node, csv_file, i (read), and the
#            variables whose names are given in $1 and $3 (read)
# Arguments: $1 - NAME of the variable holding the active BMC address
#                 (e.g. node8_bmc)
#            $2 - active port
#            $3 - NAME of the variable holding the cross BMC address
#            $4 - cross port
#######################################
collect_onet_cross() {
    local active_bmc="$1" active_port="$2" cross_bmc="$3" cross_port="$4"
    local active_bmcip="${!active_bmc}"
    local cross_bmcip="${!cross_bmc}"
    local active_host="https://$active_bmcip"
    local cross_host="https://$cross_bmcip"
    local active_node cross_node
    local active_port_cmis cross_port_cmis active_port_rssi cross_port_rssi

    # "nodeN_bmc" -> "nodeN" -> absolute node label used in the CSV.
    active_node=$(convert_node "${active_bmc%_bmc}" "$star_node")
    cross_node=$(convert_node "${cross_bmc%_bmc}" "$star_node")

    echo -e "\n============= active_port: $active_port ==============="
    _onet_write_ocs_reg "$active_host" "$active_port" 0
    _onet_write_ocs_reg "$cross_host" "$cross_port" 0xff

    echo -e "\nwait 10s, ocs stable..."
    sleep 10

    echo -e "\n----- collect active_port: $active_port data -----"
    active_port_cmis=$(ocsdiag -i "$active_bmcip" -e "$active_port" -c vcmd -p "cmis mon" 2>&1)
    cross_port_cmis=$(ocsdiag -i "$cross_bmcip" -e "$cross_port" -c vcmd -p "cmis mon" 2>&1)
    if [[ "${debug_mode:-0}" -eq 1 ]]; then
        echo "active_port_cmis $active_host: $active_port_cmis"
        echo "cross_port_cmis $cross_host: $cross_port_cmis"
    fi
    sleep 2
    cross_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$cross_host" --port "$cross_port" 2>&1)
    active_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$active_host" --port "$active_port" 2>&1)

    if [[ "${debug_mode:-0}" -eq 1 ]]; then
        echo "cross_port_rssi $cross_host: $cross_port_rssi"
        echo "active_port_rssi $active_host: $active_port_rssi"
    fi

    echo -e "\n---- 调用 Python 分析 $active_port 数据 ----"
    # The readings are handed over as environment variables that exist only
    # for this one command, so nothing has to be unset afterwards.
    CSV_FILE="$csv_file" \
    LOOP="$i" \
    ACTIVE_PORT="$active_node-P$active_port" \
    CROSS_PORT="$cross_node-P$cross_port" \
    ACTIVE_PORT_RSSI="$active_port_rssi" \
    ACTIVE_PORT_CMIS="$active_port_cmis" \
    CROSS_PORT_RSSI="$cross_port_rssi" \
    CROSS_PORT_CMIS="$cross_port_cmis" \
        python3 collect_test_data_onoc_rssi.py

    echo -e "\n=============$active_bmc active_port: $active_port 数据处理完成 ===============\n"
    sleep 2
    # The banner used to expand the undefined variable $active.
    echo -e "\n============= active cross_port: $cross_port ===============\n"
    _onet_write_ocs_reg "$cross_host" "$cross_port" 0
}
# Output files of this run. One timestamp is shared so that the log and the
# CSV of the same run carry the same suffix.
# NOTE(review): the log name keeps the "62_stress_13-16-onoc_check_ocspower"
# prefix, which looks copied from another script — confirm before renaming.
run_stamp="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="logs/summary_logs/62_stress_13-16-onoc_check_ocspower_${run_stamp}.log"
# Braces are required here: "$bmc_host8_" expands the (unset) variable
# "bmc_host8_", which dropped both the host number and the separator.
csv_file="csv_data/64_xz-onet_swb-crosstalk-a_ocs_rssi_${bmc_host1}-${bmc_host8}_${run_stamp}.csv"

# Send stdout and stderr to the terminal and, appended, to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# Start-of-run banner with the start time and the log file name.
printf '=== 测试开始于: %s ===\n' "$(date)"
printf '=== 日志文件: %s ===\n' "$LOG_FILE"
printf '%s\n' "==========================" ""

debug_mode=1

# Push the port topology to all eight BMCs with background jobs, then wait
# for every job. Ports 4 and 6 get "onetb" and port 8 gets "onoc6"; the
# port 2 / onoc5 calls remain disabled.
topo_ports=(4 6 8)
topo_kinds=(onetb onetb onoc6)

echo -e "\n====== set spnode1  topo ======"
echo
for bmc_ref in node1_bmc node2_bmc node3_bmc node4_bmc; do
    for slot in 0 1 2; do
        bash set_port_topo.sh ${!bmc_ref} ${topo_ports[slot]} ${topo_kinds[slot]} &
    done
done
echo
echo -e "\n====== set spnode2  topo ======"
echo
for bmc_ref in node5_bmc node6_bmc node7_bmc node8_bmc; do
    for slot in 0 1 2; do
        bash set_port_topo.sh ${!bmc_ref} ${topo_ports[slot]} ${topo_kinds[slot]} &
    done
done
wait
echo


# Schedule of one round, one entry per port pair: "<port> <node A> <node B>".
# Every entry is measured in both directions (A active first, then B active).
round_plan=(
    "4 node8_bmc node6_bmc"
    "6 node8_bmc node5_bmc"
    "4 node7_bmc node5_bmc"
    "6 node7_bmc node6_bmc"
    "4 node4_bmc node2_bmc"
    "6 node4_bmc node1_bmc"
    "4 node3_bmc node1_bmc"
    "6 node3_bmc node2_bmc"
)

# "i" stays a global: collect_onet_cross reads it as the round number.
for ((i=1; i<=$loop_times; i++)); do
    echo -e "\n======================================================"
    echo "===================== 第 $i 轮循环 ====================="
    echo "======================================================"

    for plan_entry in "${round_plan[@]}"; do
        read -r plan_port plan_a plan_b <<< "$plan_entry"
        collect_onet_cross "$plan_a" "$plan_port" "$plan_b" "$plan_port"
        collect_onet_cross "$plan_b" "$plan_port" "$plan_a" "$plan_port"
    done
done
echo -e "\n所有循环执行完成,数据已保存至:$csv_file"
chmod -R 755 csv_data/
# End-of-run banner.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                        to_yd/64_xz-onoc_swa-crosstalk_ocs_rssi.sh                                                          0000750 0000000 0000000 00000016714 15120511737 017726  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# Static configuration.
# Account names and passwords.
root_user='root'
root_psswd='RCms@Zte3'
remote_user='pict'
remote_psswd='PicT1!2@3#4$'

# Expander tool path, wait time and number of test rounds.
exp_tool='./smbus-tool/build/whiteriver_exp.exe'
wait_time=10
loop_times=100

# Optional-step switches.
reset_gpu_flag=false
set_topo_flag=false
run_all=true

# Node inventory file, plus per-node placeholders that are filled from it
# once --node has been parsed.
config_file='node_configs.json'
target_star_node=''
star_node=''
node1_ip=''; node1_bmc=''
node2_ip=''; node2_bmc=''
node3_ip=''; node3_bmc=''
node4_ip=''; node4_bmc=''

#######################################
# Parse the command-line options of this script.
# Globals:   target_star_node (written)
# Arguments: the script's argument list ("$@")
# Outputs:   an error message on stdout for invalid usage
# Returns:   0 on success; exits the script with status 1 when --node has
#            no value or when an unknown argument is given
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --node)
                shift
                if [[ -z "$1" || "$1" =~ ^-- ]]; then
                    echo "错误：--node 后必须指定节点编号（如 --node 21）"
                    exit 1
                fi
                target_star_node="$1"
                shift
                ;;
            *)
                # Without this arm an unrecognised argument was never
                # shifted, so the loop never terminated.
                echo "错误：未知参数 '$1'，支持的参数：--node <编号>"
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Pre-flight checks: jq must be installed, --node must have been given and
# the node inventory file must exist. Each failure exits with status 1.
command -v jq > /dev/null 2>&1 || {
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
}
[[ -n "$target_star_node" ]] || {
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
}
[[ -f "$config_file" ]] || {
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
}

# Read nodeN_ip / nodeN_bmc (N = 1..4) of the selected group with jq.
for slot in 1 2 3 4; do
    printf -v "node${slot}_ip" '%s' "$(jq -r ".\"$target_star_node\".node${slot}_ip" "$config_file")"
    printf -v "node${slot}_bmc" '%s' "$(jq -r ".\"$target_star_node\".node${slot}_bmc" "$config_file")"
done
star_node="$target_star_node"

# bmc_hostN is the fourth dot-separated field of the matching BMC address.
for slot in 1 2 3 4; do
    bmc_ref="node${slot}_bmc"
    printf -v "bmc_host${slot}" '%s' "$(cut -d '.' -f 4 <<< "${!bmc_ref}")"
done
unset slot bmc_ref


# Configuration sanity check: node1_ip must be present and not "null".
if [[ -z "$node1_ip" || "$node1_ip" == "null" ]]; then
    echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置"
    exit 1
fi

#######################################
# Map a "node<N>" label plus an offset to the label "node<N + offset - 1>".
# Arguments: $1 - label of the form node<N>
#            $2 - offset, digits only
# Outputs:   the resulting label on stdout; error text on stderr
# Returns:   0 on success, 1 when an argument has the wrong format
#######################################
convert_node() {
    local label=$1
    local base=$2
    local digits

    # sed prints the number only when the whole label is "node<digits>".
    digits=$(printf '%s\n' "$label" | sed -nE 's/^node([0-9]+)$/\1/p')

    if [[ -z "$digits" ]]; then
        echo "错误：输入 node 格式无效，需为 'node+数字'（如 node1、node2）" >&2
        return 1
    fi
    if [[ ! "$base" =~ ^[0-9]+$ ]]; then
        echo "错误：起始偏移量需为正整数" >&2
        return 1
    fi

    echo "node$((digits + base - 1))"
    return 0
}


#######################################
# Write one value to OCS register 0x100082 of OCS 1..8 on a port.
# A write is attempted up to three times for as long as the tool output
# contains "Locked", with a 3 s pause between attempts.
# Globals:   exp_tool, debug_mode (read)
# Arguments: $1 - host URL, $2 - port, $3 - register value
# Outputs:   the tool output of every attempt (plus the command line when
#            debug_mode is 1)
#######################################
_onoc_write_ocs_reg() {
    local host="$1" port="$2" value="$3"
    local ocs attempt output
    local -a cmd

    for ocs in {1..8}; do
        for ((attempt = 1; attempt <= 3; attempt++)); do
            cmd=("$exp_tool" --host "$host" --port "$port" --cmd wb-ocs
                 --reg 0x100082 --value "$value" --ocs "$ocs")
            if [[ "${debug_mode:-0}" -eq 1 ]]; then
                echo "执行命令：${cmd[*]}"
            fi
            output=$("${cmd[@]}" 2>&1)
            printf '%s\n' "$output"
            # Only a "Locked" answer is retried.
            grep -q "Locked" <<< "$output" || break
            sleep 3
        done
    done
}

#######################################
# Collect crosstalk data for one (active port, cross port) pair.
# Steps: write 0 to the active port (activate) and 0xff to the cross port
# (disable), wait 10 s, read "cmis mon" and RSSI from both ports and pass
# the readings to collect_test_data_onoc_rssi.py through environment
# variables.
# Globals:   exp_tool, debug_mode, star_node, csv_file, i (read), and the
#            variables whose names are given in $1 and $3 (read)
# Arguments: $1 - NAME of the variable holding the active BMC address
#                 (e.g. node1_bmc)
#            $2 - active port
#            $3 - NAME of the variable holding the cross BMC address
#            $4 - cross port
#######################################
collect_onoc_cross() {
    local active_bmc="$1" active_port="$2" cross_bmc="$3" cross_port="$4"
    local active_bmcip="${!active_bmc}"
    local cross_bmcip="${!cross_bmc}"
    local active_host="https://$active_bmcip"
    local cross_host="https://$cross_bmcip"
    local active_node cross_node
    local active_port_cmis cross_port_cmis active_port_rssi cross_port_rssi

    # "nodeN_bmc" -> "nodeN" -> absolute node label used in the CSV.
    active_node=$(convert_node "${active_bmc%_bmc}" "$star_node")
    cross_node=$(convert_node "${cross_bmc%_bmc}" "$star_node")

    echo -e "\n============= active_port: $active_port ==============="
    _onoc_write_ocs_reg "$active_host" "$active_port" 0
    _onoc_write_ocs_reg "$cross_host" "$cross_port" 0xff

    echo -e "\nwait 10s, ocs stable..."
    sleep 10

    echo -e "\n----- collect active_port: $active_port data -----"
    active_port_cmis=$(ocsdiag -i "$active_bmcip" -e "$active_port" -c vcmd -p "cmis mon" 2>&1)
    cross_port_cmis=$(ocsdiag -i "$cross_bmcip" -e "$cross_port" -c vcmd -p "cmis mon" 2>&1)
    if [[ "${debug_mode:-0}" -eq 1 ]]; then
        echo "active_port_cmis $active_host: $active_port_cmis"
        echo "cross_port_cmis $cross_host: $cross_port_cmis"
    fi
    sleep 2
    cross_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$cross_host" --port "$cross_port" 2>&1)
    active_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$active_host" --port "$active_port" 2>&1)

    if [[ "${debug_mode:-0}" -eq 1 ]]; then
        echo "cross_port_rssi $cross_host: $cross_port_rssi"
        echo "active_port_rssi $active_host: $active_port_rssi"
    fi

    echo -e "\n---- 调用 Python 分析 $active_port 数据 ----"
    # The readings are handed over as environment variables that exist only
    # for this one command, so nothing has to be unset afterwards.
    CSV_FILE="$csv_file" \
    LOOP="$i" \
    ACTIVE_PORT="$active_node-P$active_port" \
    CROSS_PORT="$cross_node-P$cross_port" \
    ACTIVE_PORT_RSSI="$active_port_rssi" \
    ACTIVE_PORT_CMIS="$active_port_cmis" \
    CROSS_PORT_RSSI="$cross_port_rssi" \
    CROSS_PORT_CMIS="$cross_port_cmis" \
        python3 collect_test_data_onoc_rssi.py

    echo -e "\n=============$active_bmc active_port: $active_port 数据处理完成 ===============\n"
    sleep 2
}
# Output files of this run, each suffixed with the current date and time.
printf -v LOG_FILE 'logs/summary_logs/62_stress_13-16-onoc_check_ocspower_%s.log' "$(date +%Y%m%d_%H%M%S)"
printf -v csv_file 'csv_data/62_stress_13-16-onoc_check_ocspower_%s.csv' "$(date +%Y%m%d_%H%M%S)"

# Send stdout and stderr to the terminal and, appended, to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# Start-of-run banner with the start time and the log file name.
printf '=== 测试开始于: %s ===\n' "$(date)"
printf '=== 日志文件: %s ===\n' "$LOG_FILE"
printf '%s\n' "==========================" ""

debug_mode=1

# Schedule of one round, one entry per port pair: "<port> <node A> <node B>".
# Every entry is measured in both directions (A active first, then B active).
round_plan=(
    "8 node1_bmc node2_bmc"
    "8 node3_bmc node4_bmc"
    "4 node1_bmc node3_bmc"
    "4 node2_bmc node4_bmc"
    "6 node1_bmc node4_bmc"
    "6 node2_bmc node3_bmc"
)

# "i" stays a global: collect_onoc_cross reads it as the round number.
for ((i=1; i<=$loop_times; i++)); do
    echo -e "\n======================================================"
    echo "===================== 第 $i 轮循环 ====================="
    echo "======================================================"

    for plan_entry in "${round_plan[@]}"; do
        read -r plan_port plan_a plan_b <<< "$plan_entry"
        collect_onoc_cross "$plan_a" "$plan_port" "$plan_b" "$plan_port"
        collect_onoc_cross "$plan_b" "$plan_port" "$plan_a" "$plan_port"
    done
done
echo -e "\n所有循环执行完成,数据已保存至:$csv_file"
chmod -R 755 csv_data/
# End-of-run banner.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                    to_yd/66_xz_BL-32_ltssm_linkup_oneta.sh                                                             0000750 0000000 0000000 00000037541 15120511737 017005  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# Basic settings (not tied to a node): account names and passwords, the
# expander tool path, wait time and number of test rounds.
root_user='root'
root_psswd='RCms@Zte3'
remote_user='pict'
remote_psswd='PicT1!2@3#4$'
exp_tool='./smbus-tool/build/whiteriver_exp.exe'
wait_time=10
loop_times=100

# Feature switches, changed by the --test option.
reset_gpu_flag=false
set_topo_flag=false
run_all=true

# Node configuration: the inventory file, plus per-node placeholders that
# are filled from it once --node has been parsed.
config_file='node_configs.json'
target_star_node=''
star_node=''
node1_ip=''; node1_bmc=''
node2_ip=''; node2_bmc=''
node3_ip=''; node3_bmc=''
node4_ip=''; node4_bmc=''

################################# Argument parsing #################################
# --node <id>          selects the node group (required value, must not
#                      start with "--")
# --test <name>...     clears run_all and enables the named steps
#                      (reset_gpu, set_topo) until the next "--" option
while (( $# > 0 )); do
    opt="$1"
    shift
    case "$opt" in
        --node)
            # The value must exist and must not be another option.
            if [[ -z "$1" || "$1" == --* ]]; then
                echo "错误：--node 后必须指定节点编号（如 --node 21）"
                exit 1
            fi
            target_star_node="$1"
            shift
            ;;
        --test)
            run_all=false
            # Consume every following word up to the next option.
            until [[ $# -eq 0 || "$1" =~ ^-- ]]; do
                if [[ "$1" == reset_gpu ]]; then
                    reset_gpu_flag=true
                elif [[ "$1" == set_topo ]]; then
                    set_topo_flag=true
                else
                    echo "错误：不支持的测试参数 '$1'，仅支持 reset_gpu 或 set_topo"
                    exit 1
                fi
                shift
            done
            ;;
        *)
            echo "错误：未知参数 '$opt'，支持的参数：--node <编号> --test [reset_gpu|set_topo...]"
            exit 1
            ;;
    esac
done

################################# Load JSON configuration #################################
# Pre-flight checks: jq must be installed, --node must have been given and
# the node inventory file must exist. Each failure exits with status 1.
command -v jq > /dev/null 2>&1 || {
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
}
[[ -n "$target_star_node" ]] || {
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
}
[[ -f "$config_file" ]] || {
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
}

# Read nodeN_ip / nodeN_bmc (N = 1..4) of the selected group with jq, and
# derive bmc_hostN as the fourth dot-separated field of the BMC address.
for slot in 1 2 3 4; do
    printf -v "node${slot}_ip" '%s' "$(jq -r ".\"$target_star_node\".node${slot}_ip" "$config_file")"
    printf -v "node${slot}_bmc" '%s' "$(jq -r ".\"$target_star_node\".node${slot}_bmc" "$config_file")"
done
star_node="$target_star_node"

for slot in 1 2 3 4; do
    bmc_ref="node${slot}_bmc"
    printf -v "bmc_host${slot}" '%s' "$(echo "${!bmc_ref}" | cut -d '.' -f 4)"
done
unset slot bmc_ref


# Configuration sanity check: node1_ip must be present and not "null".
if [[ -z "$node1_ip" || "$node1_ip" == "null" ]]; then
    echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置"
    exit 1
fi

printf -v LOG_FILE 'logs/summary_logs/stress-4-6-8_links_onetb_recovery_CMIS-mon_%s.log' "$(date +%Y%m%d_%H%M%S)"
# Send stdout and stderr to the terminal and, appended, to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

#################################RESETGPU#########################
#######################################
# Run "brsmi reset -g" inside the mysccl-zds container of one node and
# check that all 8 GPUs report "Successed."; up to three attempts, 20 s
# apart.
# Globals:   remote_user (read)
# Arguments: $1 - address of the node to log in to
# Outputs:   progress and the raw command output, all on stderr
# Returns:   0 when an attempt reports 8 successful GPUs, 1 otherwise
#######################################
reset_gpu() {
    local active_remote1="$1"
    local gpu_count=8
    # The counter must be local: a global "i" here overwrote the "i" of the
    # caller's round loop.
    local attempt rest_gpu success_count

    echo -e "\n------:RESET $active_remote1 GPU ------" >&2
    for ((attempt = 1; attempt <= 3; attempt++)); do
        rest_gpu=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$active_remote1" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"brsmi reset -g\""' 2>&1)
        echo "reset $active_remote1 GPU:" >&2
        echo "$rest_gpu" >&2
        success_count=$(grep -c "GPU[0-7] Successed." <<< "$rest_gpu")
        if (( success_count == gpu_count )); then
            echo -e "times$attempt: GPU RESET success" >&2
            return 0
        fi
        echo -e "times$attempt: GPU RESET fail" >&2
        sleep 20
    done
    return 1
}

############################disable_all_ltssm#########################
#######################################
# Run "./ltssm -d" inside the mysccl-zds container of one node and check
# that a "disable ltssm (2, 4, 6, 8, 10) done" line is printed for all 8
# GPUs.
# Globals:   remote_user (read)
# Arguments: $1 - address of the node to log in to
# Outputs:   the command line, the raw command output and the verdict
# Returns:   0 when 8 matching lines are found, 2 otherwise
#######################################
disable_all_ltssm() {
    local node_ip="$1"
    local gpu_count=8
    local dis_gpuport success_count

    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""')

    echo "node-$node_ip disable status:"
    echo "$dis_gpuport"
    # Count the per-GPU confirmation lines in the captured output. The count
    # used to be taken from the undefined variable dis_localgpu, so it was
    # always 0 and every call reported a failure.
    success_count=$(grep -cE '^GPU [0-7], HW\[[0-9]+\]: disable ltssm \(2, 4, 6, 8, 10\) done$' <<< "$dis_gpuport")
    if (( success_count == gpu_count )); then
        echo -e "GPU RESET success"
        return 0
    fi
    echo -e "GPU RESET FAIL"
    return 2
}

#######################################
# Run "./scripts/disable_port_ltssm <port>" inside the mysccl-zds container
# of one node and check that "HW[n]: disable ltssm <port> done" is printed
# for all 8 GPUs.
# Globals:   remote_user (read)
# Arguments: $1 - address of the node to log in to
#            $2 - port number passed to disable_port_ltssm
# Outputs:   the command line, the raw command output and the verdict
# Returns:   0 when 8 matching lines are found, 2 otherwise
#######################################
disable_port_ltssm() {
    local node_ip="$1"
    local node_port="$2"
    local gpu_count=8
    local dis_gpuport success_count
    # Command executed on the node; built once so that the logged text and
    # the executed text cannot drift apart (the log line used to print a
    # literal "$node_port").
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '"$node_port"'\""'

    echo "sshpass -p 'PicT1!2@3#4\$' ssh $remote_user@$node_ip '$remote_cmd'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" "$remote_cmd")

    echo "node-$node_ip port$node_port disable status:"
    echo "$dis_gpuport"
    # Count the per-GPU confirmation lines for this port.
    success_count=$(grep -c "HW\[[0-7]\]: disable ltssm ${node_port} done\$" <<< "$dis_gpuport")
    if (( success_count == gpu_count )); then
        echo -e "disable port ltssm success"
        return 0
    fi
    echo -e "disable port ltssm FAIL"
    return 2
}

#################################RESETEXP#########################
#######################################
# Issue a reset through the expander tool, wait 3 s, read the "stat"
# output and treat the reset as successful when the last ":"-separated
# number of the TIM_SW field is below 5.
# Globals:   exp_tool (read); current_success, fail_reason (written on
#            failure)
# Arguments: $1 - host, $2 - port
# Outputs:   the command line, the stat output and the verdict
# Returns:   0 on success, 2 when TIM_SW is missing, malformed or >= 5
#######################################
reset_exp() {
    local host="$1"
    local port="$2"
    local exp_state tim_sw_time sec_part seconds_str seconds

    # NOTE(review): "clod pri" may be a typo for "cold pri" — confirm
    # against the tool before changing the parameter.
    echo "$exp_tool" --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    "$exp_tool" --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    sleep 3
    echo "chek EXP stat..."
    exp_state=$("$exp_tool" --host "$host" --port "$port" --cmd stat 2>&1)
    echo "exp state:"
    echo "$exp_state"
    # TIM_SW value up to the next comma, then its last ":"-separated part.
    tim_sw_time=$(echo "$exp_state" | grep "TIM_SW:" | sed -n 's/.*TIM_SW: \([^,]*\).*/\1/p')
    sec_part=$(echo "$tim_sw_time" | awk -F':' '{print $NF}')
    seconds_str=$(echo "$sec_part" | tr -d '\n' | grep -oE '[0-9]+' | head -n1)
    if [[ "$seconds_str" =~ ^[0-9]+$ ]]; then
        # 10# keeps values such as "08" from being read as octal.
        seconds=$((10#$seconds_str))
    else
        echo "秒数格式无效（提取值: [$seconds_str], 视为FAIL"
        # This branch used "continue" although the function has no loop,
        # after which a stale global "seconds" decided the result.
        current_success=false
        fail_reason="GPU EXP reset FAIL"
        return 2
    fi

    if [ "$seconds" -lt 5 ]; then
        echo -e "EXP reset success"
        return 0
    fi
    echo "TIM_SW: $seconds s, EXP reset FAIL"
    current_success=false
    fail_reason="GPU EXP reset FAIL"
    return 2
}

#################################load gpu topo#########################
#######################################
# Run "./ocsTopo -s mesh_6p_32.json" inside the mysccl-zds container of one
# node and print its output. The result is not evaluated.
# Globals:   remote_user (read); node_ip, gpu_topo, output (written)
# Arguments: $1 - address of the node to log in to
#            $2 - stored in gpu_topo but not used; the topology file name
#                 is fixed in the remote command
# Outputs:   the command line and the raw command output
#######################################
load_gputopo() {
    node_ip=$1
    gpu_topo=$2
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ocsTopo -s mesh_6p_32.json\""'

    printf '\n----load-gpu-topo ----\n'
    # Log the command, then run it and capture stdout together with stderr.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd"
    output=$(sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd" 2>&1)
    sleep 1

    echo "load topo result:"
    echo "$output"
    ## check all HWID port ready or not
    # hwid_count=8
    # link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node2_port\]: Ready")
    # if [ $link1_count -eq $hwid_count ]; then
    #     echo -e "RETRAIN LINK 1 success"
    #     return 0
    # else
    #     echo "RETRAIN LINK 1 ready FAIL"
    #     return 0
    # fi
}

#################################LINKUP#########################
#######################################
# Train one ONET link: run "ocsTopo -p <port>" on both ends (the first end
# in the background) and check that the second end reports
# "HWID n Port[<port>]: Ready" for all 8 HWIDs.
# Globals:   remote_user (read)
# Arguments: $1 - first node address,  $2 - port on the first node
#            $3 - second node address, $4 - port on the second node
# Outputs:   both command lines, the output of the second end, the verdict
# Returns:   0 when 8 "Ready" lines are found, 2 otherwise. The failure
#            status used to be 0 as well, which made the callers'
#            "if ! onet_linkup ..." branches unreachable.
#######################################
onet_linkup() {
    local node1_ip=$1
    local node1_port=$2
    local node2_ip=$3
    local node2_port=$4
    local hwid_count=8
    local link1_output link1_count
    local cmd_a='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node1_port"'\""'
    local cmd_b='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node2_port"'\""'

    echo -e "\n----ONET- LINKUP ----"
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node1_ip" "$cmd_a"
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node2_ip" "$cmd_b"

    # Both ends are started together; only the second one is captured.
    sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node1_ip" "$cmd_a" &
    # "2>&1" must be on the command's own line; on a separate line it was
    # an empty command and stderr was never captured.
    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node2_ip" "$cmd_b" 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # All HWID ports of the second end must be Ready.
    link1_count=$(grep -c "HWID [0-7] Port\[$node2_port\]: Ready" <<< "$link1_output")
    if (( link1_count == hwid_count )); then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#######################################
# Train one ONOC port: run "ocsTopo -p <port>" on a node and check that it
# reports "HWID n Port[<port>]: Ready" for all 8 HWIDs.
# Globals:   remote_user (read)
# Arguments: $1 - node address, $2 - port
# Outputs:   the command line, the raw command output and the verdict
# Returns:   0 when 8 "Ready" lines are found, 2 otherwise. The failure
#            status used to be 0 as well, which made the callers'
#            "if ! onoc_linkup ..." branch unreachable.
#######################################
onoc_linkup() {
    local node_ip=$1
    local node_port=$2
    local hwid_count=8
    local link1_output link1_count
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node_port"'\""'

    echo -e "\n----ONOC-LINKUP ----"
    echo sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" "$remote_cmd"

    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" "$remote_cmd" 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # All HWID ports must be Ready.
    link1_count=$(grep -c "HWID [0-7] Port\[$node_port\]: Ready" <<< "$link1_output")
    if (( link1_count == hwid_count )); then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#################################GPUSPEED#########################
#######################################
# Check that "ocsTopo -c" on a node reports "Port <port>: GEN: 5, Width: 8"
# 8 times; up to three attempts, 6 s apart.
# Globals:   remote_user (read); current_success, fail_reason (written)
# Arguments: $1 - address of the node to log in to
#            $2 - port to check
# Outputs:   the matching lines, the match count and the verdict
# Returns:   0 when an attempt finds 8 matching lines, 2 otherwise
#######################################
gpu_speed() {
    local host=$1
    local port=$2
    local GEN5_pattern="Port $port: GEN: 5, Width: 8"
    local expected_count=8
    local attempt link1_full link1_GEN link1_count

    #----------- check all GPU port is GEN5X8 or not-------------
    echo -e "\n----ONETA- check all GPU port $port is GEN5X8 or not ----"
    for ((attempt = 1; attempt <= 3; attempt++)); do
        # Log in to the host given as $1; the command used to take the
        # address from the unrelated global node_ip instead.
        link1_full=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$host" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -c\""')
        link1_GEN=$(grep "Port $port" <<< "$link1_full")
        echo -e "link1:\n $link1_GEN"
        link1_count=$(grep -c "$GEN5_pattern" <<< "$link1_GEN")
        echo "link1 gen5 count: $link1_count"
        if (( link1_count == expected_count )); then
            echo -e "RETRAIN LINK 1 GEN5X8 success"
            current_success=true
            return 0
        fi
        echo -e "RETRAIN LINK 1 GEN5X8 FAIL"
        current_success=false
        fail_reason="RETRAIN LINK 1 GEN5X8 FAIL"
        sleep 6
    done
    return 2
}

# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

# Values that do not change between rounds.
nodes=("$node1_ip" "$node2_ip" "$node3_ip" "$node4_ip")
node_bmcs=("$node1_bmc" "$node2_bmc" "$node3_bmc" "$node4_bmc")
bmc_host_list="$bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4"
# Port -> topology pushed by set_port_topo.sh.
topo_ports=(2 4 6 8)
topo_kinds=(onoc5 oneta oneta onoc6)
# ONET links trained in every round: end A, end B and the port on both ends.
onet_side_a=("$node1_ip" "$node2_ip" "$node1_ip" "$node2_ip")
onet_side_b=("$node3_ip" "$node4_ip" "$node4_ip" "$node3_ip")
onet_ports=(4 4 6 6)
# Ports trained on each node separately by onoc_linkup.
ports=(2 8)

for ((i = 1; i <= loop_times; i++)); do
    echo -e "\n============= 压测linkup 测试第$i 轮 ==============="
    echo
    current_success=true
    fail_reason=""

    # Optional: push the port topology to every BMC in parallel.
    if [ "$set_topo_flag" = true ]; then
        echo -e "\n====== set 4G toop ======"
        echo
        for bmc in "${node_bmcs[@]}"; do
            for slot in "${!topo_ports[@]}"; do
                bash set_port_topo.sh "$bmc" "${topo_ports[slot]}" "${topo_kinds[slot]}" &
            done
        done
        wait
        echo
    fi

    # Optional: reset the GPUs of all four nodes in parallel.
    echo -e "\n====== reset 4G GPU ======"
    if [ "$reset_gpu_flag" = true ]; then
        echo
        echo bash reset_br_gpu.sh $node1_ip
        for node in "${nodes[@]}"; do
            bash reset_br_gpu.sh "$node" &
        done
        wait
        sleep 3
        echo
    fi

    # Disable LTSSM on every node.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]------ RESET  GPU ------"
    for node in "${nodes[@]}"; do
        if ! disable_all_ltssm "$node"; then
            current_success=false
            fail_reason+=" GPU disable port fail on node $node; "
        fi
    done

    sleep 1

    # Cold-reset the expanders of all four BMCs.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----reset exp ----"
    (cd smbus-tool && python3 scripts/exp_cold_reset.py --host "$bmc_host_list" --port 2,4,6,8)
    sleep 1

    # Load the GPU topology on every node.
    for node in "${nodes[@]}"; do
        if ! load_gputopo "$node" "/workspace/zds/mesh_6p_32.json"; then
            current_success=false
            fail_reason+=" GPU load topo fail on node $node; "
        fi
    done

    # #OCS-SCREEN
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----ocs_screen before linkup----"
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host $bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4 --port 2,4,6,8 --skip error)

    # ONET link-up, one link at a time. The failure message names the port
    # of the failing link; it used to expand "$port", which is unset or
    # left over from the ONOC loop of the previous round at this point.
    for slot in "${!onet_ports[@]}"; do
        link_port="${onet_ports[slot]}"
        if ! onet_linkup "${onet_side_a[slot]}" "$link_port" "${onet_side_b[slot]}" "$link_port"; then
            current_success=false
            fail_reason+=" port-$link_port linkup fail; "
        fi
        sleep 2
    done

    # ONOC link-up on every node and port.
    for node in "${nodes[@]}"; do
        for port in "${ports[@]}"; do
            if ! onoc_linkup "$node" "$port"; then
                current_success=false
                fail_reason+=" port-$port onoc linkup fail; "
                sleep 2
            fi
        done
    done

    # OCS screen after a 10 s wait.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 10s ocs_screen ----"
    sleep 10
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)
    sleep 6
    # #SLEEP 60S ocs-screen
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 60s ocs_screen ----"
    # sleep 60
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host $bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4 --port 2,4,6,8 --skip error)

    # Result of this round.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]---- 本轮测试结果 ----"
    if [ "$current_success" = true ]; then
        echo -e "\n第$i轮测试成功"
    else
        echo -e "\n第$i轮测试FAIL: $fail_reason"
    fi

done

# Record the end time.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                                                               to_yd/n18_xz_ltssm_linkup_onoc_stress.sh                                                            0000750 0000000 0000000 00000036114 15120511737 017607  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# Static settings shared by every run (not tied to a particular node group).
remote_user='pict'
remote_psswd='PicT1!2@3#4$'
root_user='root'
root_psswd='RCms@Zte3'
exp_tool='./smbus-tool/build/whiteriver_exp.exe'
loop_times=100
wait_time=10

# Feature switches; the --test option below changes them.
run_all=true
set_topo_flag=false
reset_gpu_flag=false

# Node-group settings. The empty values are placeholders that are filled in
# once the JSON configuration has been read.
config_file='node_configs.json'
target_star_node=''
star_node=''
node1_ip='' node1_bmc=''
node2_ip='' node2_bmc=''
node3_ip='' node3_bmc=''
node4_ip='' node4_bmc=''

#################################参数解析#################################
# Command-line parsing. Recognised options:
#   --node <id>        key of the node group to use (stored in target_star_node)
#   --test <name>...   one or more of reset_gpu / set_topo; also clears run_all
# Anything else aborts the script with exit status 1.
while (( $# > 0 )); do
    if [[ "$1" == "--node" ]]; then
        shift
        # The option needs a value, and that value must not be another option.
        if [[ -z "$1" || "$1" == --* ]]; then
            echo "错误：--node 后必须指定节点编号（如 --node 21）"
            exit 1
        fi
        target_star_node="$1"
        shift
    elif [[ "$1" == "--test" ]]; then
        shift
        run_all=false
        # Consume every following word up to the next option (or the end).
        until (( $# == 0 )) || [[ "$1" == --* ]]; do
            if [[ "$1" == "reset_gpu" ]]; then
                reset_gpu_flag=true
            elif [[ "$1" == "set_topo" ]]; then
                set_topo_flag=true
            else
                echo "错误：不支持的测试参数 '$1'，仅支持 reset_gpu 或 set_topo"
                exit 1
            fi
            shift
        done
    else
        echo "错误：未知参数 '$1'，支持的参数：--node <编号> --test [reset_gpu|set_topo...]"
        exit 1
    fi
done

#################################加载JSON配置#################################
# 校验依赖和配置文件
# Validate the prerequisites, then load the node group selected with --node
# from $config_file. On success node{1..4}_ip, node{1..4}_bmc, star_node and
# bmc_host{1..4} are set; any problem aborts the script with exit status 1.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

# Read every address of the selected group. The group id and the field name
# are handed to jq with --arg so they are treated as data, never as jq code.
# A missing group or field yields an empty string ("// empty") and is rejected,
# so the literal text "null" can never end up being used as a host name.
for cfg_key in node1_ip node1_bmc node2_ip node2_bmc \
               node3_ip node3_bmc node4_ip node4_bmc; do
    cfg_val=$(jq -r --arg group "$target_star_node" --arg key "$cfg_key" \
        '.[$group][$key] // empty' "$config_file")
    if [[ -z "$cfg_val" || "$cfg_val" == "null" ]]; then
        echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置（缺少 $cfg_key）"
        exit 1
    fi
    printf -v "$cfg_key" '%s' "$cfg_val"
done
unset cfg_key cfg_val
star_node="$target_star_node"

# bmc_hostN is the fourth dot-separated field of the matching BMC address.
bmc_host1=$(echo "$node1_bmc" | cut -d '.' -f 4)
bmc_host2=$(echo "$node2_bmc" | cut -d '.' -f 4)
bmc_host3=$(echo "$node3_bmc" | cut -d '.' -f 4)
bmc_host4=$(echo "$node4_bmc" | cut -d '.' -f 4)

LOG_FILE="logs/summary_logs/n18_onoc_linkup_stress_$(date +%Y%m%d_%H%M%S).log"
# tee does not create missing parent directories, so make sure the log
# directory exists before redirecting into it.
if ! mkdir -p -- "${LOG_FILE%/*}"; then
    echo "错误：无法创建日志目录 ${LOG_FILE%/*}" >&2
    exit 1
fi
# From here on, stdout and stderr go to both the terminal and the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

#################################RESETGPU#########################
#######################################
# Reset the GPUs of one node by running "brsmi reset -g" inside the
# mysccl-zds container over ssh, retrying up to three times.
# An attempt succeeds when the output contains eight "GPU<n> Successed." lines.
# Globals:   remote_user (read)
# Arguments: $1 - address of the node to reset
# Outputs:   progress and the raw command output, all on stderr
# Returns:   0 once an attempt succeeds, 2 when every attempt failed
#######################################
reset_gpu() {
    local active_remote1=$1
    local gpu_count=8
    local max_attempts=3
    # The counter is local so it cannot overwrite the caller's loop variable
    # (the main loop of this script also counts its rounds in "i").
    local attempt rest_gpu success_count

    echo -e "\n------:RESET $active_remote1 GPU ------" >&2
    for ((attempt=1; attempt<=max_attempts; attempt++)); do
        rest_gpu=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$active_remote1" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"brsmi reset -g\""' 2>&1)
        echo "reset $active_remote1 GPU:" >&2
        echo "$rest_gpu" >&2
        success_count=$(echo "$rest_gpu" | grep -c "GPU[0-7] Successed.")
        if [ "$success_count" -eq "$gpu_count" ]; then
            echo -e "times$attempt: GPU RESET success" >&2
            return 0
        fi
        echo -e "times$attempt: GPU RESET fail" >&2
        # Pause before the next attempt; pointless after the last one.
        if (( attempt < max_attempts )); then
            sleep 20
        fi
    done
    return 2
}

############################disable_all_ltssm#########################
#######################################
# Run "./ltssm -d" inside the mysccl-zds container of one node and check that
# all eight GPUs report "disable ltssm (2, 4, 6, 8, 10) done".
# Globals:   remote_user (read), node_ip (written)
# Arguments: $1 - address of the node
# Outputs:   the command being run, its output and the verdict on stdout
# Returns:   0 when all eight GPUs confirmed, 2 otherwise
#######################################
disable_all_ltssm() {
    # NOTE(review): node_ip is left global on purpose, because other helpers
    # in this script read $node_ip - confirm before making it local.
    node_ip=$1
    local gpu_count=8
    local dis_gpuport success_count

    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""')

    echo "node-$node_ip disable status:"
    echo "$dis_gpuport"
    # Count the per-GPU confirmations in the output that was just captured.
    success_count=$(echo "$dis_gpuport" | grep -cE '^GPU [0-7], HW\[[0-9]+\]: disable ltssm \(2, 4, 6, 8, 10\) done$')
    if [ "$success_count" -eq "$gpu_count" ]; then
        echo -e "GPU RESET success"
        return 0
    fi
    echo -e "GPU RESET FAIL"
    return 2
}

# Run ./scripts/disable_port_ltssm for one port inside the mysccl-zds
# container of a node and verify the result.
#   $1 - node address, $2 - port number
# Sets the globals node_ip, node_port, gpu_count, dis_gpuport, success_count.
# Returns 0 when eight "HW[n]: disable ltssm <port> done" lines are found,
# otherwise 2.
disable_port_ltssm() {
    node_ip=$1
    node_port=$2
    gpu_count=8
    # Command executed on the remote node, with the port spliced in.
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '$node_port'\""'

    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '$node_port'\""'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd")

    printf '%s\n' "node-$node_ip port$node_port disable status:"
    printf '%s\n' "$dis_gpuport"

    # One confirmation line is expected per GPU.
    success_count=$(printf '%s\n' "$dis_gpuport" | grep -c "HW\[[0-7]\]: disable ltssm $node_port done\$")
    if (( success_count != gpu_count )); then
        echo -e "disable port ltssm FAIL"
        return 2
    fi
    echo -e "disable port ltssm success"
    return 0
}

#################################RESETEXP#########################
#######################################
# Reset one EXP through $exp_tool, then read its state and judge the reset by
# the seconds part of the reported TIM_SW value (must be below 5).
# Globals:   exp_tool (read), current_success / fail_reason (written on failure)
# Arguments: $1 - host value passed to --host, $2 - port passed to --port
# Outputs:   the commands, the raw state and the verdict on stdout
# Returns:   0 on success; 2 when TIM_SW cannot be parsed or is 5 or more
#######################################
reset_exp() {
    local host="$1"
    local port="$2"
    # Local so that a value parsed by an earlier call can never be reused.
    local exp_state tim_sw_time sec_part seconds_str seconds

    # NOTE(review): "clod pri" looks like a typo for "cold pri" - confirm
    # against the tool before changing the parameter.
    echo $exp_tool --host $host --port $port --cmd reset --param "clod pri" 2>&1
    $exp_tool --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    sleep 3
    echo "chek EXP stat..."
    exp_state=$($exp_tool --host "$host" --port "$port" --cmd stat 2>&1)
    echo "exp state:"
    echo "$exp_state"
    # Text after "TIM_SW: " up to the next comma, then its last ':' field.
    tim_sw_time=$(echo "$exp_state" | grep "TIM_SW:" | sed -n 's/.*TIM_SW: \([^,]*\).*/\1/p')
    sec_part=$(echo "$tim_sw_time" | awk -F':' '{print $NF}')
    seconds_str=$(echo "$sec_part" | tr -d '\n' | grep -oE '[0-9]+' | head -n1)
    if [[ ! "$seconds_str" =~ ^[0-9]+$ ]]; then
        # An unreadable value is treated as a failed reset. The function is not
        # a loop body, so it must return here rather than use "continue".
        echo "秒数格式无效（提取值: [$seconds_str], 视为FAIL"
        current_success=false
        fail_reason="GPU EXP reset FAIL"
        return 2
    fi
    # Force base 10 so a leading zero is not read as octal.
    seconds=$((10#$seconds_str))

    if [ "$seconds" -lt 5 ]; then
        echo -e "EXP reset success"
        return 0
    fi
    echo "TIM_SW: $seconds s, EXP reset FAIL"
    current_success=false
    fail_reason="GPU EXP reset FAIL"
    return 2
}

#################################load gpu topo#########################
# Load the GPU topology on one node by running "./ocsTopo -s mesh_6p_32.json"
# inside the mysccl-zds container over ssh, and print what it returned.
#   $1 - node address
#   $2 - topology file; stored in gpu_topo but not used by the remote command,
#        which always names mesh_6p_32.json
# Sets the globals node_ip, gpu_topo and output.
# The result is only printed, not checked, so the return status is that of
# the final echo.
load_gputopo() {
    node_ip=$1
    gpu_topo=$2
    # Command executed on the remote node.
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ocsTopo -s mesh_6p_32.json\""'

    printf '\n%s\n' '----load-gpu-topo ----'
    # Log the command line before running it.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd"

    output=$(sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd" 2>&1)
    sleep 1
    printf '%s\n' 'load topo result:'
    echo "$output"
}

#################################LINKUP#########################
#######################################
# Train one ONET link: run "ocsTopo -p <port>" on both ends at the same time
# (first end in the background, second end in the foreground) and check the
# output of the second end for eight "HWID <n> Port[<port>]: Ready" lines.
# Globals:   remote_user (read)
# Arguments: $1 - first node address,  $2 - port on the first node
#            $3 - second node address, $4 - port on the second node
# Outputs:   the commands, the second end's output and the verdict on stdout
# Returns:   0 when all eight HWIDs are Ready, 2 otherwise
#######################################
onet_linkup() {
    local node1_ip=$1
    local node1_port=$2
    local node2_ip=$3
    local node2_port=$4
    local hwid_count=8
    local link1_output link1_count

    echo -e "\n----ONET- LINKUP ----"
    # Log both command lines before running them.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node1_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '$node1_port'\""'
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node2_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '$node2_port'\""'

    sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node1_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node1_port"'\""' &
    # 2>&1 must sit on the same line as the command: on a line of its own
    # inside $( ) it is a separate empty command and captures nothing.
    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node2_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node2_port"'\""' 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # Every HWID must report the port as Ready.
    link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node2_port\]: Ready")
    if [ "$link1_count" -eq "$hwid_count" ]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    # Non-zero so that callers testing "if ! onet_linkup ..." see the failure.
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#######################################
# Train one ONOC port: run "ocsTopo -p <port>" inside the mysccl-zds container
# of a node and check for eight "HWID <n> Port[<port>]: Ready" lines.
# Globals:   remote_user (read)
# Arguments: $1 - node address, $2 - port number
# Outputs:   the command, its output and the verdict on stdout
# Returns:   0 when all eight HWIDs are Ready, 2 otherwise
#######################################
onoc_linkup() {
    local node_ip=$1
    local node_port=$2
    local hwid_count=8
    local link1_output link1_count

    echo -e "\n----ONOC-LINKUP ----"
    # Log the command line before running it.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '$node_port'\""'

    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node_port"'\""' 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # Every HWID must report the port as Ready.
    link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node_port\]: Ready")
    if [ "$link1_count" -eq "$hwid_count" ]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    # Non-zero so that callers testing "if ! onoc_linkup ..." see the failure.
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#################################GPUSPEED#########################
#######################################
# Check that one port reports "GEN: 5, Width: 8" on all eight GPUs of a node,
# using the output of "ocsTopo -c"; up to three attempts, 6 s apart.
# Globals:   remote_user (read), current_success / fail_reason (written only
#            when every attempt failed)
# Arguments: $1 - node address, $2 - port number
# Outputs:   the matching lines, the count and the verdict on stdout
# Returns:   0 when an attempt finds eight matching lines, 2 otherwise
#######################################
gpu_speed() {
    local host=$1
    local port=$2
    local expected_count=8
    local max_attempts=3
    local GEN5_pattern="Port $port: GEN: 5, Width: 8"
    local m link1_full link1_GEN link1_count

    echo -e "\n----ONETA- check all GPU port $port is GEN5X8 or not ----"
    for ((m=1; m<=max_attempts; m++)); do
        # Query the node given as $1, not whatever a previous helper left in
        # the global node_ip.
        link1_full=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$host" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -c\""')
        link1_GEN=$(echo "$link1_full" | grep  "Port $port" 2>&1)
        echo -e "link1:\n $link1_GEN"
        link1_count=$(echo "$link1_GEN" | grep -c "$GEN5_pattern" 2>&1)
        echo "link1 gen5 count: $link1_count"
        if [ "$link1_count" -eq "$expected_count" ]; then
            # current_success is left untouched here so that a failure recorded
            # earlier in the same round is not overwritten.
            echo -e "RETRAIN LINK 1 GEN5X8 success"
            return 0
        fi
        echo -e "RETRAIN LINK 1 GEN5X8 FAIL"
        if (( m < max_attempts )); then
            sleep 6
        fi
    done

    # Only the final outcome is recorded, appended like the other failure
    # reasons collected by the main loop.
    current_success=false
    fail_reason+=" RETRAIN LINK 1 GEN5X8 FAIL; "
    return 2
}

# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

# Stress loop. Each round works on node 2 only: optionally set the port
# topology and reset the GPUs, disable LTSSM, cold-reset the EXPs, load the
# GPU topology, bring up ONOC ports 2 and 8, run ocs_screen, then print the
# verdict collected in current_success / fail_reason.
for ((i=1; i<=loop_times; i++)); do
    echo -e "\n============= 压测linkup 测试第$i 轮 ==============="
    echo
    current_success=true
    fail_reason=""

    # Set the port topology (only when requested with --test set_topo).
    if [ "$set_topo_flag" = true ]; then
        echo -e "\n====== set 4G toop ======"
        echo
        bash set_port_topo.sh "$node2_bmc" 2 onoc5 &
        #bash set_port_topo.sh $node2_bmc 4 onoc6 &
        #bash set_port_topo.sh $node2_bmc 6 onoc7 &
        bash set_port_topo.sh "$node2_bmc" 8 onoc6 &
        wait
        echo
    fi

    # Reset the GPUs (only when requested with --test reset_gpu).
    echo -e "\n====== reset 4G GPU ======"
    if [ "$reset_gpu_flag" = true ]; then
        echo
        # Log the command that is actually run below (node 2).
        echo bash reset_br_gpu.sh $node2_ip
        #bash reset_br_gpu.sh $node1_ip &
        bash reset_br_gpu.sh "$node2_ip" &
        #bash reset_br_gpu.sh $node3_ip &
        #bash reset_br_gpu.sh $node4_ip &
        wait
        sleep 3
        echo
    fi

    # Disable LTSSM on every listed node.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]------ RESET  GPU ------"
    nodes=("$node2_ip")
    for node in "${nodes[@]}"; do
        if ! disable_all_ltssm "$node"; then
            current_success=false
            fail_reason+=" GPU disable port fail on node $node; "
        fi
    done

    sleep 1

    # Cold-reset the EXPs.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----reset exp ----"
    (cd smbus-tool && python3 scripts/exp_cold_reset.py --host "$bmc_host2" --port 2,8)
    sleep 1

    # Load the GPU topology.
    nodes=("$node2_ip")
    for node in "${nodes[@]}"; do
        if ! load_gputopo "$node" "/workspace/zds/mesh_6p_32.json"; then
            current_success=false
            fail_reason+=" GPU load topo fail on node $node; "
        fi
    done

    # #OCS-SCREEN
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----ocs_screen before linkup----"
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host $bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4 --port 2,4,6,8 --skip error)

    # ONET link-up is disabled in this script.
    #if ! onet_linkup $node1_ip "4" $node3_ip "4"; then
    #    current_success=false
    #    fail_reason+=" port-$port linkup fail; "
    #fi
    #sleep 2
    #if ! onet_linkup $node2_ip "4" $node4_ip "4"; then
    #    current_success=false
    #    fail_reason+=" port-$port linkup fail; "
    #fi
    #sleep 2
    #if ! onet_linkup $node1_ip "6" $node4_ip "6"; then
    #    current_success=false
    #    fail_reason+=" port-$port linkup fail; "
    #fi
    #sleep 2
    #if ! onet_linkup $node2_ip "6" $node3_ip "6"; then
    #    current_success=false
    #    fail_reason+=" port-$port linkup fail; "
    #fi
    #sleep 2

    # ONOC link-up on every listed node and port.
    nodes=("$node2_ip")
    ports=(2 8)
    for node in "${nodes[@]}"; do
        for port in "${ports[@]}"; do
            if ! onoc_linkup "$node" "$port"; then
                current_success=false
                fail_reason+=" port-$port onoc linkup fail; "
                sleep 2
            fi
        done
    done

    # ocs_screen after a 10 s settle time.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 10s ocs_screen ----"
    sleep 10
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host2" --port 2,8 --skip error)
    sleep 6
    # #SLEEP 60S ocs-screen
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 60s ocs_screen ----"
    # sleep 60
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host $bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4 --port 2,4,6,8 --skip error)

    # Verdict of this round.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]---- 本轮测试结果 ----"
    if [ "$current_success" = true ]; then
        echo -e "\n第$i轮测试成功"
    else
        echo -e "\n第$i轮测试FAIL: $fail_reason"
    fi

done

# 记录结束时间
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                                                                                                                                                                                                                                                                                                                                                    to_yd/69_xz_BL-16_reset-gpu_linkup_oneta.sh                                                         0000750 0000000 0000000 00000033717 15120511737 017564  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# Static settings shared by every run.
remote_user='pict'
remote_psswd='PicT1!2@3#4$'
root_user='root'
root_psswd='RCms@Zte3'
exp_tool='./smbus-tool/build/whiteriver_exp.exe'
loop_times=100
wait_time=10

# Feature switches; the --test option below changes them.
run_all=true
set_topo_flag=false

# Node-group settings. The empty values are placeholders that are filled in
# once the JSON configuration has been read.
config_file='node_configs.json'
target_star_node=''
star_node=''
node1_ip='' node1_bmc=''
node2_ip='' node2_bmc=''
node3_ip='' node3_bmc=''
node4_ip='' node4_bmc=''

# Command-line parsing. Recognised options:
#   --node <id>        key of the node group to use (stored in target_star_node)
#   --test <name>...   only set_topo is accepted here; also clears run_all
# Anything else aborts the script with exit status 1.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --node)
            shift
            # The option needs a value, and that value must not be another option.
            if [[ -z "$1" || "$1" =~ ^-- ]]; then
                echo "错误：--node 后必须指定节点编号（如 --node 21）"
                exit 1
            fi
            target_star_node="$1"
            shift
            ;;
        --test)
            shift
            run_all=false
            # Consume every following word up to the next option (or the end).
            while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
                case "$1" in
                    set_topo)
                        set_topo_flag=true
                        ;;
                    *)
                        echo "错误：不支持的测试参数 '$1'，仅支持 set_topo"
                        exit 1
                        ;;
                esac
                shift
            done
            ;;
        *)
            # The usage text lists only what this script really accepts.
            echo "错误：未知参数 '$1'，支持的参数：--node <编号> --test [set_topo]"
            exit 1
            ;;
    esac
done

# 校验依赖和配置文件
# Validate the prerequisites, then load the node group selected with --node
# from $config_file. On success node{1..4}_ip, node{1..4}_bmc, star_node and
# bmc_host{1..4} are set; any problem aborts the script with exit status 1.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

# Read every address of the selected group. The group id and the field name
# are handed to jq with --arg so they are treated as data, never as jq code.
# A missing group or field yields an empty string ("// empty") and is rejected,
# so the literal text "null" can never end up being used as a host name.
for cfg_key in node1_ip node1_bmc node2_ip node2_bmc \
               node3_ip node3_bmc node4_ip node4_bmc; do
    cfg_val=$(jq -r --arg group "$target_star_node" --arg key "$cfg_key" \
        '.[$group][$key] // empty' "$config_file")
    if [[ -z "$cfg_val" || "$cfg_val" == "null" ]]; then
        echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置（缺少 $cfg_key）"
        exit 1
    fi
    printf -v "$cfg_key" '%s' "$cfg_val"
done
unset cfg_key cfg_val
star_node="$target_star_node"

# bmc_hostN is the fourth dot-separated field of the matching BMC address.
bmc_host1=$(echo "$node1_bmc" | cut -d '.' -f 4)
bmc_host2=$(echo "$node2_bmc" | cut -d '.' -f 4)
bmc_host3=$(echo "$node3_bmc" | cut -d '.' -f 4)
bmc_host4=$(echo "$node4_bmc" | cut -d '.' -f 4)


LOG_FILE="logs/summary_logs/stress-4-6-8_links_onetb_recovery_CMIS-mon_$(date +%Y%m%d_%H%M%S).log"
# tee does not create missing parent directories, so make sure the log
# directory exists before redirecting into it.
if ! mkdir -p -- "${LOG_FILE%/*}"; then
    echo "错误：无法创建日志目录 ${LOG_FILE%/*}" >&2
    exit 1
fi
# From here on, stdout and stderr go to both the terminal and the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

#################################RESETGPU#########################
#######################################
# Reset the GPUs of one node by running "brsmi reset -g" inside the
# mysccl-zds container over ssh, retrying up to three times.
# An attempt succeeds when the output contains eight "GPU<n> Successed." lines.
# Globals:   remote_user (read)
# Arguments: $1 - address of the node to reset
# Outputs:   progress and the raw command output, all on stderr
# Returns:   0 once an attempt succeeds, 2 when every attempt failed
#######################################
reset_gpu() {
    local active_remote1=$1
    local gpu_count=8
    local max_attempts=3
    # The counter is local so it cannot overwrite the caller's loop variable
    # (the main loop of this script also counts its rounds in "i").
    local attempt rest_gpu success_count

    echo -e "\n------:RESET $active_remote1 GPU ------" >&2
    for ((attempt=1; attempt<=max_attempts; attempt++)); do
        rest_gpu=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$active_remote1" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"brsmi reset -g\""' 2>&1)
        echo "reset $active_remote1 GPU:" >&2
        echo "$rest_gpu" >&2
        success_count=$(echo "$rest_gpu" | grep -c "GPU[0-7] Successed.")
        if [ "$success_count" -eq "$gpu_count" ]; then
            echo -e "times$attempt: GPU RESET success" >&2
            return 0
        fi
        echo -e "times$attempt: GPU RESET fail" >&2
        # Pause before the next attempt; pointless after the last one.
        if (( attempt < max_attempts )); then
            sleep 20
        fi
    done
    return 2
}
############################disable_all_ltssm#########################
#######################################
# Run "./ltssm -d" inside the mysccl-zds container of one node and check that
# all eight GPUs report "disable ltssm (2, 4, 6, 8, 10) done".
# Globals:   remote_user (read), node_ip (written)
# Arguments: $1 - address of the node
# Outputs:   the command being run, its output and the verdict on stdout
# Returns:   0 when all eight GPUs confirmed, 2 otherwise
#######################################
disable_all_ltssm() {
    # NOTE(review): node_ip is left global on purpose, because other helpers
    # in this script read $node_ip - confirm before making it local.
    node_ip=$1
    local gpu_count=8
    local dis_gpuport success_count

    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""')

    echo "node-$node_ip disable status:"
    echo "$dis_gpuport"
    # Count the per-GPU confirmations in the output that was just captured.
    success_count=$(echo "$dis_gpuport" | grep -cE '^GPU [0-7], HW\[[0-9]+\]: disable ltssm \(2, 4, 6, 8, 10\) done$')
    if [ "$success_count" -eq "$gpu_count" ]; then
        echo -e "GPU RESET success"
        return 0
    fi
    echo -e "GPU RESET FAIL"
    return 2
}

# Run ./scripts/disable_port_ltssm for one port inside the mysccl-zds
# container of a node and verify the result.
#   $1 - node address, $2 - port number
# Sets the globals node_ip, node_port, gpu_count, dis_gpuport, success_count.
# Returns 0 when eight "HW[n]: disable ltssm <port> done" lines are found,
# otherwise 2.
disable_port_ltssm() {
    node_ip=$1
    node_port=$2
    gpu_count=8
    # Command executed on the remote node, with the port spliced in.
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '$node_port'\""'

    echo "sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '$node_port'\""'"
    dis_gpuport=$(sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd")

    printf '%s\n' "node-$node_ip port$node_port disable status:"
    printf '%s\n' "$dis_gpuport"

    # One confirmation line is expected per GPU.
    success_count=$(printf '%s\n' "$dis_gpuport" | grep -c "HW\[[0-7]\]: disable ltssm $node_port done\$")
    if (( success_count != gpu_count )); then
        echo -e "disable port ltssm FAIL"
        return 2
    fi
    echo -e "disable port ltssm success"
    return 0
}

#################################RESETEXP#########################
#######################################
# Reset one EXP through $exp_tool, then read its state and judge the reset by
# the seconds part of the reported TIM_SW value (must be below 5).
# Globals:   exp_tool (read), current_success / fail_reason (written on failure)
# Arguments: $1 - host value passed to --host, $2 - port passed to --port
# Outputs:   the commands, the raw state and the verdict on stdout
# Returns:   0 on success; 2 when TIM_SW cannot be parsed or is 5 or more
#######################################
reset_exp() {
    local host="$1"
    local port="$2"
    # Local so that a value parsed by an earlier call can never be reused.
    local exp_state tim_sw_time sec_part seconds_str seconds

    # NOTE(review): "clod pri" looks like a typo for "cold pri" - confirm
    # against the tool before changing the parameter.
    echo $exp_tool --host $host --port $port --cmd reset --param "clod pri" 2>&1
    $exp_tool --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    sleep 3
    echo "chek EXP stat..."
    exp_state=$($exp_tool --host "$host" --port "$port" --cmd stat 2>&1)
    echo "exp state:"
    echo "$exp_state"
    # Text after "TIM_SW: " up to the next comma, then its last ':' field.
    tim_sw_time=$(echo "$exp_state" | grep "TIM_SW:" | sed -n 's/.*TIM_SW: \([^,]*\).*/\1/p')
    sec_part=$(echo "$tim_sw_time" | awk -F':' '{print $NF}')
    seconds_str=$(echo "$sec_part" | tr -d '\n' | grep -oE '[0-9]+' | head -n1)
    if [[ ! "$seconds_str" =~ ^[0-9]+$ ]]; then
        # An unreadable value is treated as a failed reset. The function is not
        # a loop body, so it must return here rather than use "continue".
        echo "秒数格式无效（提取值: [$seconds_str], 视为FAIL"
        current_success=false
        fail_reason="GPU EXP reset FAIL"
        return 2
    fi
    # Force base 10 so a leading zero is not read as octal.
    seconds=$((10#$seconds_str))

    if [ "$seconds" -lt 5 ]; then
        echo -e "EXP reset success"
        return 0
    fi
    echo "TIM_SW: $seconds s, EXP reset FAIL"
    current_success=false
    fail_reason="GPU EXP reset FAIL"
    return 2
}

#################################load BL-16 topo#########################
# Load the BL-16 topology on one node by running "./ocsTopo -s mesh_7p.json"
# inside the mysccl-zds container over ssh, and print what it returned.
#   $1 - node address
# Sets the globals node_ip and output.
# The result is only printed, not checked, so the return status is that of
# the final echo.
load_bl16_topo() {
    node_ip=$1
    # Command executed on the remote node.
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ocsTopo -s mesh_7p.json\""'

    printf '\n%s\n' '----load-gpu-topo ----'
    # Log the command line before running it.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd"

    output=$(sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip "$remote_cmd" 2>&1)
    sleep 1
    printf '%s\n' 'load topo result:'
    echo "$output"
}
#################################LINKUP#########################
#######################################
# Train one ONET link: run "ocsTopo -p <port>" on both ends at the same time
# (first end in the background, second end in the foreground) and check the
# output of the second end for eight "HWID <n> Port[<port>]: Ready" lines.
# Globals:   remote_user (read)
# Arguments: $1 - first node address,  $2 - port on the first node
#            $3 - second node address, $4 - port on the second node
# Outputs:   the commands, the second end's output and the verdict on stdout
# Returns:   0 when all eight HWIDs are Ready, 2 otherwise
#######################################
onet_linkup() {
    local node1_ip=$1
    local node1_port=$2
    local node2_ip=$3
    local node2_port=$4
    local hwid_count=8
    local link1_output link1_count

    echo -e "\n----ONET- LINKUP ----"
    # Log both command lines before running them.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node1_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '$node1_port'\""'
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node2_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '$node2_port'\""'

    sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node1_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node1_port"'\""' &
    # 2>&1 must sit on the same line as the command: on a line of its own
    # inside $( ) it is a separate empty command and captures nothing.
    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node2_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node2_port"'\""' 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # Every HWID must report the port as Ready.
    link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node2_port\]: Ready")
    if [ "$link1_count" -eq "$hwid_count" ]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    # Non-zero so that callers testing "if ! onet_linkup ..." see the failure.
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#######################################
# Train one ONOC port: run "ocsTopo -p <port>" inside the mysccl-zds container
# of a node and check for eight "HWID <n> Port[<port>]: Ready" lines.
# Globals:   remote_user (read)
# Arguments: $1 - node address, $2 - port number
# Outputs:   the command, its output and the verdict on stdout
# Returns:   0 when all eight HWIDs are Ready, 2 otherwise
#######################################
onoc_linkup() {
    local node_ip=$1
    local node_port=$2
    local hwid_count=8
    local link1_output link1_count

    echo -e "\n----ONOC-LINKUP ----"
    # Log the command line before running it.
    echo sshpass -p 'PicT1!2@3#4$' ssh $remote_user@$node_ip 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '$node_port'\""'

    link1_output=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$node_ip" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node_port"'\""' 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    # Every HWID must report the port as Ready.
    link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node_port\]: Ready")
    if [ "$link1_count" -eq "$hwid_count" ]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    # Non-zero so that callers testing "if ! onoc_linkup ..." see the failure.
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

#################################GPUSPEED#########################
#######################################
# Check that one port reports "GEN: 5, Width: 8" on all eight GPUs of a node,
# using the output of "ocsTopo -c"; up to three attempts, 6 s apart.
# Globals:   remote_user (read), current_success / fail_reason (written only
#            when every attempt failed)
# Arguments: $1 - node address, $2 - port number
# Outputs:   the matching lines, the count and the verdict on stdout
# Returns:   0 when an attempt finds eight matching lines, 2 otherwise
#######################################
gpu_speed() {
    local host=$1
    local port=$2
    local expected_count=8
    local max_attempts=3
    local GEN5_pattern="Port $port: GEN: 5, Width: 8"
    local m link1_full link1_GEN link1_count

    echo -e "\n----ONETA- check all GPU port $port is GEN5X8 or not ----"
    for ((m=1; m<=max_attempts; m++)); do
        # Query the node given as $1, not whatever a previous helper left in
        # the global node_ip.
        link1_full=$(sshpass -p 'PicT1!2@3#4$' ssh "$remote_user@$host" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -c\""')
        link1_GEN=$(echo "$link1_full" | grep  "Port $port" 2>&1)
        echo -e "link1:\n $link1_GEN"
        link1_count=$(echo "$link1_GEN" | grep -c "$GEN5_pattern" 2>&1)
        echo "link1 gen5 count: $link1_count"
        if [ "$link1_count" -eq "$expected_count" ]; then
            # current_success is left untouched here so that a failure recorded
            # earlier in the same round is not overwritten.
            echo -e "RETRAIN LINK 1 GEN5X8 success"
            return 0
        fi
        echo -e "RETRAIN LINK 1 GEN5X8 FAIL"
        if (( m < max_attempts )); then
            sleep 6
        fi
    done

    # Only the final outcome is recorded, appended like the other failure
    # reasons collected by the main loop.
    current_success=false
    fail_reason+=" RETRAIN LINK 1 GEN5X8 FAIL; "
    return 2
}

# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

# Stress loop. Each round: optionally set the port topology on all four nodes,
# reset the GPUs of all four nodes, cold-reset the EXPs, run ocs_screen, bring
# up the ONET links on port 8 and the ONOC ports 2/4/6, run ocs_screen again,
# then print the verdict collected in current_success / fail_reason.
for ((i=1; i<=loop_times; i++)); do
    echo -e "\n============= 压测linkup 测试第$i 轮 ==============="
    echo
    current_success=true
    fail_reason=""

    # Set the port topology (only when requested with --test set_topo).
    # All sixteen jobs run in parallel and are awaited together.
    if [ "$set_topo_flag" = true ]; then
        echo -e "\n====== set 4G toop ======"
        echo
        for topo_bmc in "$node1_bmc" "$node2_bmc" "$node3_bmc" "$node4_bmc"; do
            bash set_port_topo.sh "$topo_bmc" 2 onoc5 &
            bash set_port_topo.sh "$topo_bmc" 4 onoc6 &
            bash set_port_topo.sh "$topo_bmc" 6 onoc7 &
            bash set_port_topo.sh "$topo_bmc" 8 oneta &
        done
        wait
        echo
    fi

    # Reset the GPUs of all four nodes in parallel.
    echo -e "\n====== reset 4G GPU ======"
    echo
    echo bash reset_br_gpu.sh $node1_ip
    for gpu_node in "$node1_ip" "$node2_ip" "$node3_ip" "$node4_ip"; do
        bash reset_br_gpu.sh "$gpu_node" &
    done
    wait
    sleep 3
    echo

    # Cold-reset the EXPs.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----reset exp ----"
    (cd smbus-tool && python3 scripts/exp_cold_reset.py --host "$bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4" --port 2,4,6,8)
    sleep 1

    # ocs_screen before link-up.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----ocs_screen before linkup----"
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4" --port 2,4,6,8 --skip error)

    # ONET link-up on port 8. The port is written out in the failure text
    # because the loop variable "port" is not set at this point.
    if ! onet_linkup "$node1_ip" "8" "$node2_ip" "8"; then
        current_success=false
        fail_reason+=" onet port-8 linkup fail; "
    fi
    sleep 2
    if ! onet_linkup "$node3_ip" "8" "$node4_ip" "8"; then
        current_success=false
        fail_reason+=" onet port-8 linkup fail; "
    fi
    sleep 2

    # ONOC link-up on every node and port.
    nodes=("$node1_ip" "$node2_ip" "$node3_ip" "$node4_ip")
    ports=(2 4 6)
    for node in "${nodes[@]}"; do
        for port in "${ports[@]}"; do
            if ! onoc_linkup "$node" "$port"; then
                current_success=false
                fail_reason+=" port-$port onoc linkup fail; "
            fi
        done
    done

    # ocs_screen after a 10 s settle time, on the same four hosts as the
    # screen taken before link-up.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 10s ocs_screen ----"
    sleep 10
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4" --port 2,4,6,8 --skip error)
    sleep 6
    #SLEEP 60S ocs-screen
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 60s ocs_screen ----"
    # sleep 60
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host $bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4 --port 2,4,6,8 --skip error)

    # Verdict of this round.
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]---- 本轮测试结果 ----"
    if [ "$current_success" = true ]; then
        echo -e "\n第$i轮测试成功"
    else
        echo -e "\n第$i轮测试FAIL: $fail_reason"
    fi

done

# 记录结束时间
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                 to_yd/.vscode/                                                                                      0000750 0000000 0000000 00000000000 15120511737 012230  5                                                                                                    ustar   root                            root                                                                                                                                                                                                                   to_yd/.vscode/launch.json                                                                           0000750 0000000 0000000 00000000772 15120511737 014406  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   {
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false
        }
    ]
}      to_yd/node_configs.json                                                                             0000750 0000000 0000000 00000003222 15120511740 014213  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   {
  "1": {
    "node1_ip": "10.57.216.162",
    "node1_bmc": "10.57.216.91",
    "node2_ip": "10.57.216.170",
    "node2_bmc": "10.57.216.92",
    "node3_ip": "10.57.216.175",
    "node3_bmc": "10.57.216.93",
    "node4_ip": "10.57.216.154",
    "node4_bmc": "10.57.216.94"
  },
  "5": {
    "node1_ip": "10.57.216.174",
    "node1_bmc": "10.57.216.95",
    "node2_ip": "10.57.216.168",
    "node2_bmc": "10.57.216.96",
    "node3_ip": "10.57.216.150",
    "node3_bmc": "10.57.216.97",
    "node4_ip": "10.57.216.185",
    "node4_bmc": "10.57.216.98"
  },
  "9": {
    "node1_ip": "10.57.216.180",
    "node1_bmc": "10.57.216.99",
    "node2_ip": "10.57.216.176",
    "node2_bmc": "10.57.216.100",
    "node3_ip": "10.57.216.145",
    "node3_bmc": "10.57.216.101",
    "node4_ip": "10.57.216.134",
    "node4_bmc": "10.57.216.102"
  },
  "13": {
    "node1_ip": "10.57.216.157",
    "node1_bmc": "10.57.216.103",
    "node2_ip": "10.57.216.188",
    "node2_bmc": "10.57.216.104",
    "node3_ip": "10.57.216.156",
    "node3_bmc": "10.57.216.105",
    "node4_ip": "10.57.216.151",
    "node4_bmc": "10.57.216.106"
  },
  "17": {
    "node1_ip": "10.57.216.167",
    "node1_bmc": "10.57.216.107",
    "node2_ip": "10.57.216.177",
    "node2_bmc": "10.57.216.108",
    "node3_ip": "10.57.216.139",
    "node3_bmc": "10.57.216.109",
    "node4_ip": "10.57.216.163",
    "node4_bmc": "10.57.216.110"
  },
  "21": {
    "node1_ip": "10.57.216.148",
    "node1_bmc": "10.57.216.111",
    "node2_ip": "10.57.216.187",
    "node2_bmc": "10.57.216.112",
    "node3_ip": "10.57.216.165",
    "node3_bmc": "10.57.216.113",
    "node4_ip": "10.57.216.166",
    "node4_bmc": "10.57.216.114"
  }
}                                                                                                                                                                                                                                                                                                                                                                              to_yd/collect_test_data_onoc_rssi.py                                                                0000750 0000000 0000000 00000022554 15120511737 017007  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   import os
import re
import pandas as pd

def extract_cmis_params(cmis_mon_str):
    """
    Extract per-module temperature and per-channel TX power / laser bias
    current from the text produced by the "cmis mon" diagnostic command.

    The raw text is normalised before matching:
      * ANSI colour / erase-line escape sequences are removed;
      * every ASCII control character (CR, LF, TAB, ...) is deleted, which
        joins all lines into one string;
      * whitespace runs collapse to a single space;
      * a space is inserted after the CMIS / CH keywords
        ("CMIS1" -> "CMIS 1", "CH3" -> "CH 3").

    Args:
        cmis_mon_str: raw command output (str).

    Returns:
        dict containing, for every module n in 1..8 and channel c in 1..8:
            "cmis{n}_temp"               temperature text, or None
            "cmis{n}_ch{c}_TX"           TX power text (dBm), or None
            "cmis{n}_ch{c}_laser_ibias"  bias current text (mA), or None
            "cmis{n}_ch{c}_RX"           always None in this function
        Values are the matched substrings (str), not floats.
    """
    # Step 1: strip ANSI colour / erase-line escape sequences.
    text = re.sub(r'\x1B\[[0-9;]*[mK]', '', cmis_mon_str)
    # Step 2: delete all control characters (this also removes line breaks).
    text = re.sub(r'[\x00-\x1F\x7F]', '', text)
    # Step 3: collapse whitespace runs into one space.
    text = re.sub(r'\s+', ' ', text).strip()
    # Step 4: make sure the CMIS / CH keywords are separated from their index.
    text = re.sub(r'CMIS(\d+)', r'CMIS \1', text)
    text = re.sub(r'CH(\d+)', r'CH \1', text)

    number = r"([+-]?\d+(\.\d+)?)"
    # The TX / IBIAS patterns do not depend on module or channel; build once.
    tx_pattern = re.compile(rf"TX {number} dBm", re.IGNORECASE)
    ibias_pattern = re.compile(rf"IBIAS {number} mA", re.IGNORECASE)

    result = {}

    # ---------------- temperatures (with debug logging) ----------------
    for cmis_num in range(1, 9):
        temp_key = f"cmis{cmis_num}_temp"
        temp_match = re.search(
            rf"CMIS {cmis_num} TEMP: {number} C", text, re.IGNORECASE
        )
        result[temp_key] = temp_match.group(1) if temp_match else None
        if result[temp_key]:
            print(f"成功匹配 CMIS {cmis_num} TEMP: {result[temp_key]} C")
        else:
            # Show the text belonging to this module to help diagnose the miss.
            # find() returns -1 when the module is absent; never use that as
            # a slice index.
            start_idx = text.find(f"CMIS {cmis_num}")
            if start_idx == -1:
                target_block = ""
            else:
                end_idx = -1
                if cmis_num < 8:
                    end_idx = text.find(f"CMIS {cmis_num + 1}", start_idx)
                if end_idx == -1:
                    end_idx = len(text)
                target_block = text[start_idx:end_idx].strip()
            print(f"警告：未匹配到 CMIS {cmis_num} TEMP，当前区块内容：{target_block}")

    # ---------------- per-channel parameters ----------------
    for cmis_num in range(1, 9):
        for ch_num in range(1, 9):
            tx_key = f"cmis{cmis_num}_ch{ch_num}_TX"
            rx_key = f"cmis{cmis_num}_ch{ch_num}_RX"
            ibias_key = f"cmis{cmis_num}_ch{ch_num}_laser_ibias"

            # The channel record ends where the next "CMIS " record starts.
            # Because line breaks were deleted above, the next record may be
            # glued to this one ("... mACMIS 1 CH 2: ..."), so the separator
            # before "CMIS" is optional. Requiring a space here would let the
            # capture run into the following channels and borrow their values.
            line_match = re.search(
                rf"CMIS {cmis_num} CH {ch_num}: (.*?)(?=\s*CMIS |$)",
                text,
                re.DOTALL | re.IGNORECASE,
            )
            if not line_match:
                result[tx_key] = None
                result[ibias_key] = None
            else:
                line_content = line_match.group(1)
                tx_match = tx_pattern.search(line_content)
                ibias_match = ibias_pattern.search(line_content)
                result[tx_key] = tx_match.group(1) if tx_match else None
                result[ibias_key] = ibias_match.group(1) if ibias_match else None
            # RX is not part of this text; the key is created as a placeholder.
            result[rx_key] = None

    return result


def parse_rssi_data(rssi_str):
    """
    Build the per-OCS RX table from the text printed by the RSSI reader.

    Each useful line names an "ocs<N>." source and carries a bracketed,
    comma-separated list of decimal values after "RSSI_DBM:".

    Args:
        rssi_str: text containing the ocs1..ocs8 lines.

    Returns:
        dict {n: [ch1, ..., ch8]} for n in 1..8. Entries are the matched
        value strings, or None where no value was found. Only the first
        eight values of a line are used; lines for an index outside 1..8
        are ignored.
    """
    row_re = re.compile(
        r".*?ocs(\d+)\.[^:]+:.*?RSSI_DBM:\s*\[\s*([+-]?\d+\.\d+(?:\s*,\s*[+-]?\d+\.\d+)*)\s*\]"
    )

    rssi_dict = {}
    for module in range(1, 9):
        rssi_dict[module] = [None] * 8

    for raw_line in rssi_str.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        found = row_re.search(line)
        if found is None:
            # "Namespace" lines are expected noise; report anything else.
            if "Namespace" not in line:
                print(f"未匹配的RSSI行：{line}")
            continue

        module = int(found.group(1))
        if not 1 <= module <= 8:
            continue

        # Split on commas with optional surrounding whitespace and keep at
        # most eight channel values.
        values = re.split(r"\s*,\s*", found.group(2))
        for slot, value in enumerate(values[:8]):
            rssi_dict[module][slot] = value

    return rssi_dict

def main():
    """
    Combine CMIS monitor data and RSSI data for an active/cross port pair
    and append one CSV row per (OCS, lane).

    All inputs come from environment variables:
        CSV_FILE          path of the CSV file to append to
        LOOP              value written to the OCS_TOPO column
        ACTIVE_PORT       label of the active port
        CROSS_PORT        label of the cross port
        ACTIVE_PORT_CMIS  raw "cmis mon" text of the active port
        CROSS_PORT_CMIS   raw "cmis mon" text of the cross port
        ACTIVE_PORT_RSSI  raw RSSI text of the active port
        CROSS_PORT_RSSI   raw RSSI text of the cross port

    The CSV header is written only when the file is missing or empty.

    Raises:
        SystemExit: when there are rows to write but CSV_FILE is not set.
    """
    # 1. Basic parameters.
    csv_file = os.getenv("CSV_FILE", "")
    loop = os.getenv("LOOP", "")
    active_port = os.getenv("ACTIVE_PORT", "")
    cross_port = os.getenv("CROSS_PORT", "")

    # 2. Raw CMIS monitor text of both ports.
    active_port_cmis = os.getenv("ACTIVE_PORT_CMIS", "")
    cross_port_cmis = os.getenv("CROSS_PORT_CMIS", "")

    # 3. Raw RSSI text of both ports (source of the RX values).
    active_port_rssi = os.getenv("ACTIVE_PORT_RSSI", "")
    cross_port_rssi = os.getenv("CROSS_PORT_RSSI", "")

    # 4. TX / IBIAS / temperature from the CMIS text. The log lines name the
    #    real port and say whether raw input or parsed output is shown.
    print(f"{active_port} 的 cmis 原始数据：\n{active_port_cmis}")
    active_port_params = extract_cmis_params(active_port_cmis)
    print(f"{active_port} 的 cmis 解析结果：\n{active_port_params}")
    cross_port_params = extract_cmis_params(cross_port_cmis)

    # 5. Overwrite the RX placeholders with the values parsed from RSSI text.
    for port_label, params, rssi_raw in (
        (active_port, active_port_params, active_port_rssi),
        (cross_port, cross_port_params, cross_port_rssi),
    ):
        print(f"{port_label} 的 RSSI 原始数据：\n{rssi_raw}")
        rssi_dict = parse_rssi_data(rssi_raw)
        print(f"{port_label} 的 RSSI 解析结果：\n{rssi_dict}")
        for cmis_num in range(1, 9):
            for ch_idx in range(8):
                rx_key = f"cmis{cmis_num}_ch{ch_idx + 1}_RX"
                params[rx_key] = rssi_dict[cmis_num][ch_idx]

    # 6. Assemble rows. Column names (including the "avtive_" spelling) are
    #    kept as-is because rows are appended to existing CSV files.
    new_header = ["ACTIVE_PORT", "OCS_TOPO", "CROSS_PORT", "OCS", "lane", "cross_temp", "cross_RX", "cross_TX", "cross_ibias", "avtive_temp", "avtive_RX", "avtive_TX", "avtive_ibias"]
    rows = []
    for cmis_num in range(1, 9):
        cross_temp_val = cross_port_params.get(f"cmis{cmis_num}_temp")
        active_temp_val = active_port_params.get(f"cmis{cmis_num}_temp")
        for ch_num in range(1, 9):
            rx_key = f"cmis{cmis_num}_ch{ch_num}_RX"
            tx_key = f"cmis{cmis_num}_ch{ch_num}_TX"
            ibias_key = f"cmis{cmis_num}_ch{ch_num}_laser_ibias"

            cross_rx_val = cross_port_params.get(rx_key)
            cross_tx_val = cross_port_params.get(tx_key)
            cross_ibias_val = cross_port_params.get(ibias_key)

            # A row is emitted only when the cross side produced some value.
            # NOTE(review): lanes with active-side data only are dropped;
            # confirm this is intended.
            if not (cross_rx_val or cross_tx_val or cross_ibias_val):
                continue

            rows.append({
                "ACTIVE_PORT": active_port,
                "OCS_TOPO": loop,
                "CROSS_PORT": cross_port,
                "OCS": cmis_num,
                "lane": ch_num,
                "cross_temp": cross_temp_val,
                "cross_RX": cross_rx_val,
                "cross_TX": cross_tx_val,
                "cross_ibias": cross_ibias_val,
                "avtive_temp": active_temp_val,
                "avtive_RX": active_port_params.get(rx_key),
                "avtive_TX": active_port_params.get(tx_key),
                "avtive_ibias": active_port_params.get(ibias_key),
            })

    # 7. Append to the CSV file.
    if not rows:
        print(f"未提取到任何 CMIS 通道数据（active_port:{active_port},）")
        return

    if not csv_file:
        # Fail with a clear message instead of an opaque open("") error.
        raise SystemExit("错误：未设置环境变量 CSV_FILE，无法写入 CSV")

    csv_dir = os.path.dirname(csv_file)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    df = pd.DataFrame(rows, columns=new_header)
    write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0
    df.to_csv(
        csv_file,
        mode="a",
        header=write_header,
        index=False,
        encoding="utf-8"
    )
    print(f"数据已写入 {csv_file}（active_port:{active_port}）")

if __name__ == "__main__":
    main()                                                                                                                                                    to_yd/64_xz-dark_current_filter.sh                                                                  0000750 0000000 0000000 00000017447 15120511737 016243  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# configure
# NOTE(review): credentials are stored here in plain text; consider loading
# them from the environment or a protected file.
root_user="root"
root_psswd="RCms@Zte3"
remote_user="pict"
remote_psswd="PicT1!2@3#4$"
exp_tool="./smbus-tool/build/whiteriver_exp.exe"
wait_time=10
loop_times=1

reset_gpu_flag=false
set_topo_flag=false
run_all=true 

# Node configuration; the empty values are filled in after argument parsing.
config_file="node_configs.json"  
target_star_node=""
node1_ip=""
node1_bmc=""
node2_ip=""
node2_bmc=""
node3_ip=""
node3_bmc=""
node4_ip=""
node4_bmc=""
star_node=""

#######################################
# Parse the command line.
# Globals:   target_star_node (written)
# Arguments: the script arguments; only "--node NODE_ID" is accepted
# Outputs:   error text on stdout
# Returns:   exits the script with status 1 on a missing value or an
#            unknown argument
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --node)
                if [[ -z "${2:-}" || "$2" =~ ^-- ]]; then
                    echo "错误：--node 后必须指定节点编号（如 --node 21）"
                    exit 1
                fi
                target_star_node="$2"
                shift 2
                ;;
            *)
                # Without this arm an unknown argument is never shifted and
                # the loop spins forever.
                echo "错误：未知参数 $1"
                echo "用法: $0 --node NODE_ID"
                exit 1
                ;;
        esac
    done
}
parse_args "$@"

# Validate dependencies and inputs before touching the configuration.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

# Read node1..node4 ip/bmc for the selected star node from the JSON file.
# The node id and key are passed with --arg so they are treated as data, not
# as part of the jq program. "// empty" turns a missing entry into an empty
# string, and every field is checked (not only node1_ip).
for cfg_key in node1_ip node1_bmc node2_ip node2_bmc \
               node3_ip node3_bmc node4_ip node4_bmc; do
    cfg_value=$(jq -r --arg node "$target_star_node" --arg key "$cfg_key" \
        '.[$node][$key] // empty' "$config_file")
    if [[ -z "$cfg_value" ]]; then
        echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置（缺少 $cfg_key）"
        exit 1
    fi
    printf -v "$cfg_key" '%s' "$cfg_value"
done
unset cfg_key cfg_value
star_node="$target_star_node"

# Last octet of each BMC address.
bmc_host1="${node1_bmc##*.}"
bmc_host2="${node2_bmc##*.}"
bmc_host3="${node3_bmc##*.}"
bmc_host4="${node4_bmc##*.}"

#######################################
# Translate a relative node name into an absolute one.
# Arguments:
#   $1 - relative node name of the form "node<digits>" (e.g. node1)
#   $2 - number of the first node of the group (non-negative integer)
# Outputs:
#   "node<N>" on stdout, where N = <digits> + $2 - 1; errors on stderr
# Returns:
#   0 on success, 1 when either argument is malformed
#######################################
convert_node() {
    local input_node="$1"
    local start_node="$2"
    local node_suffix
    local target_node_num

    if [[ "$input_node" =~ ^node([0-9]+)$ ]]; then
        node_suffix="${BASH_REMATCH[1]}"
    else
        echo "错误：输入 node 格式无效，需为 'node+数字'（如 node1、node2）" >&2
        return 1
    fi
    if ! [[ "$start_node" =~ ^[0-9]+$ ]]; then
        echo "错误：起始偏移量需为正整数" >&2
        return 1
    fi

    # Force base 10: without the 10# prefix a leading zero ("08", "09")
    # is read as octal and the arithmetic expansion fails.
    target_node_num=$(( 10#$node_suffix + 10#$start_node - 1 ))
    echo "node$target_node_num"
    return 0
}


#######################################
# Write 0xff to register 0x100082 of OCS 1..8 on one host/port.
# Each write is attempted up to three times, three seconds apart, and is
# repeated only while the tool output contains "Locked".
# Globals:   exp_tool, debug_mode (read); ocs, m, success (written)
# Arguments: $1 - host URL, $2 - port
# Outputs:   the tool output of every attempt, flattened to one line
#######################################
_onoc_write_ff_all_ocs() {
    local host_url="$1"
    local port="$2"
    for ocs in {1..8}; do
        for ((m=1; m<=3; m++)); do
            local cmd="$exp_tool --host $host_url --port $port --cmd wb-ocs --reg 0x100082 --value 0xff --ocs $ocs"
            if [ $debug_mode -eq 1 ]; then
                echo "执行命令：$cmd"
            fi
            local output=$($cmd 2>&1)
            echo $output
            if ! grep -q "Locked" <<<"$output"; then
                success=true
                break
            fi
            sleep 3
        done
    done
}

#######################################
# Collect CMIS and RSSI data for one active/cross port pair and hand it to
# collect_test_data_onoc_rssi.py through environment variables.
# Globals:   star_node, debug_mode, csv_file, i (read); the variables named
#            by $1 and $3 are read through indirection
# Arguments:
#   $1 - NAME of the variable holding the active BMC IP (e.g. node1_bmc)
#   $2 - active port
#   $3 - NAME of the variable holding the cross BMC IP
#   $4 - cross port
# Outputs:   progress and debug text on stdout
#######################################
collect_onoc_cross() {
    local active_bmc="$1"
    local active_port=$2
    local cross_bmc=$3
    local cross_port=$4

    # Resolve the variable names to the addresses they hold.
    local active_bmcip="${!active_bmc}"
    local cross_bmcip="${!cross_bmc}"
    local active_host="https://$active_bmcip"
    local cross_host="https://$cross_bmcip"

    # "node1_bmc" -> "node1" -> absolute node name used in the CSV labels.
    local active_node=$(convert_node ${active_bmc%_bmc} $star_node)
    local cross_node=$(convert_node ${cross_bmc%_bmc} $star_node)

    echo -e "\n============= active_port: $active_port ==============="
    # Both sides get the same 0xff write: active side first, then cross side.
    # NOTE(review): the original comments call the first write "activate"
    # and the second "disable" although the value is 0xff in both - confirm.
    _onoc_write_ff_all_ocs "$active_host" "$active_port"
    _onoc_write_ff_all_ocs "$cross_host" "$cross_port"

    echo -e "\nwait 10s, ocs stable..."
    sleep 10

    echo -e "\n----- collect active_port: $active_port data -----"
    local active_port_cmis=$(ocsdiag -i $active_bmcip -e $active_port -c vcmd -p "cmis mon" 2>&1)
    local cross_port_cmis=$(ocsdiag -i $cross_bmcip -e $cross_port -c vcmd -p "cmis mon" 2>&1)
    if [ $debug_mode -eq 1 ]; then
        echo "active_port_cmis $active_host: $active_port_cmis"
    fi
    if [ $debug_mode -eq 1 ]; then
        echo "cross_port_cmis $cross_host: $cross_port_cmis"
    fi
    sleep 2
    local cross_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host $cross_host --port $cross_port 2>&1)
    local active_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host $active_host --port $active_port 2>&1)

    # Debug output (enable on demand).
    if [ $debug_mode -eq 1 ]; then
        echo "cross_port_rssi $cross_host: $cross_port_rssi"
    fi
    if [ $debug_mode -eq 1 ]; then
        echo "active_port_rssi $active_host: $active_port_rssi"
    fi

    echo -e "\n---- 调用 Python 分析 $active_port 数据 ----"
    export CSV_FILE="$csv_file"
    export LOOP="$i"
    export ACTIVE_PORT="$active_node-P$active_port"
    export CROSS_PORT="$cross_node-P$cross_port"
    export ACTIVE_PORT_RSSI="$active_port_rssi"
    export ACTIVE_PORT_CMIS="$active_port_cmis"
    export CROSS_PORT_RSSI="$cross_port_rssi"
    export CROSS_PORT_CMIS="$cross_port_cmis"

    python3 collect_test_data_onoc_rssi.py

    # Drop the hand-over variables so they do not leak into the next call.
    unset CSV_FILE LOOP ACTIVE_BMC ACTIVE_PORT CROSS_BMC CROSS_PORT
    unset ACTIVE_PORT_RSSI ACTIVE_PORT_CMIS CROSS_PORT_RSSI CROSS_PORT_CMIS

    echo -e "\n=============$active_bmc active_port: $active_port 数据处理完成 ===============\n"
    sleep 2
}
# One timestamp for both files so the log and the CSV of a run share a name.
run_stamp=$(date +%Y%m%d_%H%M%S)
LOG_FILE="logs/summary_logs/62_stress_13-16-onoc_check_ocspower_${run_stamp}.log"
csv_file="csv_data/62_stress_13-16-onoc_check_ocspower_${run_stamp}.csv"

# tee and the CSV writer do not create directories; make sure they exist.
mkdir -p "$(dirname "$LOG_FILE")" "$(dirname "$csv_file")" || exit 1

# Send all output to the terminal and to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1
# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

debug_mode=1

# The loop counter must stay named "i": collect_onoc_cross reads it as the
# value exported in LOOP.
for ((i=1; i<=loop_times; i++)); do
    echo -e "\n======================================================"
    echo "===================== 第 $i 轮循环 ====================="
    echo "======================================================"

    # port 8
    collect_onoc_cross node1_bmc 8 node2_bmc 8
    # collect_onoc_cross node2_bmc 8 node1_bmc 8
    collect_onoc_cross node3_bmc 8 node4_bmc 8
    # collect_onoc_cross node4_bmc 8 node3_bmc 8
    # port 4
    collect_onoc_cross node1_bmc 4 node3_bmc 4
    # collect_onoc_cross node3_bmc 4 node1_bmc 4
    collect_onoc_cross node2_bmc 4 node4_bmc 4
    # collect_onoc_cross node4_bmc 4 node2_bmc 4
    # port 6
    collect_onoc_cross node1_bmc 6 node4_bmc 6
    # collect_onoc_cross node4_bmc 6 node1_bmc 6
    collect_onoc_cross node2_bmc 6 node3_bmc 6
    # collect_onoc_cross node3_bmc 6 node2_bmc 6
    # port 2
    collect_onoc_cross node1_bmc 2 node4_bmc 2
    # collect_onoc_cross node4_bmc 2 node1_bmc 2
    collect_onoc_cross node2_bmc 2 node3_bmc 2
    # collect_onoc_cross node3_bmc 2 node2_bmc 2
done
echo -e "\n所有循环执行完成,数据已保存至:$csv_file"
chmod -R 755 csv_data/
# Record the end time.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                                                                                                                         to_yd/37_enable_ocs.sh                                                                              0000750 0000000 0000000 00000027475 15120511740 013642  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# configure
# NOTE(review): credentials are stored here in plain text; consider loading
# them from the environment or a protected file.
root_user="root"
root_psswd="RCms@Zte3"
remote_user="pict"
remote_psswd="PicT1!2@3#4$"
wait_time=10
export exp_tool="./smbus-tool/build/whiteriver_exp.exe"

reset_gpu_flag=false
set_topo_flag=false
run_all=true 

# Node configuration; the empty values are filled in after argument parsing.
config_file="node_configs.json"  
target_star_node=""
node1_ip=""
node1_bmc=""
node2_ip=""
node2_bmc=""
node3_ip=""
node3_bmc=""
node4_ip=""
node4_bmc=""
star_node=""
host1=""
host2=""
test_port=""

#######################################
# Parse the command line.
# Globals:   target_star_node, host1, host2, test_port (written)
# Arguments: the script arguments:
#              --node NODE_ID
#              --host HOST1[,HOST2]   (entries after the second are ignored)
#              --port PORT
# Outputs:   error / usage text on stdout
# Returns:   exits the script with status 1 on a missing value or an
#            unknown argument
#######################################
parse_args() {
    local -a host_numbers=()
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --node)
                if [[ -z "${2:-}" || "$2" =~ ^-- ]]; then
                    echo "错误：--node 后必须指定节点编号（如 --node 21）"
                    exit 1
                fi
                target_star_node="$2"
                shift 2
                ;;
            --host)
                if [[ -z "${2:-}" || "$2" =~ ^-- ]]; then
                    echo "错误：--host 后必须指定主机列表（如 --host 102,103）"
                    exit 1
                fi
                IFS=',' read -ra host_numbers <<< "$2"
                host1="${host_numbers[0]}"
                if (( ${#host_numbers[@]} > 1 )); then
                    host2="${host_numbers[1]}"
                fi
                shift 2
                ;;
            --port)
                if [[ -z "${2:-}" || "$2" =~ ^-- ]]; then
                    # The message used to say "node number" for this option.
                    echo "错误：--port 后必须指定端口编号（如 --port 4）"
                    exit 1
                fi
                test_port="$2"
                shift 2
                ;;
            *)
                echo "错误：未知参数 $1"
                echo "用法: $0 [--node NODE_ID] [--host HOST1,HOST2,...] [--port PORT]"
                exit 1
                ;;
        esac
    done
}
parse_args "$@"

# Validate dependencies and inputs before touching the configuration.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi
# The node id is used in arithmetic below, so it has to be a plain number.
if ! [[ "$target_star_node" =~ ^[0-9]+$ ]]; then
    echo "错误：--node 必须为数字节点编号（如 --node 21），当前为: $target_star_node"
    exit 1
fi

first_node="$target_star_node"
# 10# keeps a value such as "08" from being read as octal.
second_node="$((10#$target_star_node + 4))"

# Read the configuration with jq. node1..node4 come from the selected star
# node; node5..node8 are node1..node4 of the star node four positions later.
# Node id and key are passed with --arg so they are data, not jq program text.
for cfg_idx in 1 2 3 4; do
    for cfg_field in ip bmc; do
        cfg_key="node${cfg_idx}_${cfg_field}"

        # First group: every field is mandatory.
        cfg_value=$(jq -r --arg node "$first_node" --arg key "$cfg_key" \
            '.[$node][$key] // empty' "$config_file")
        if [[ -z "$cfg_value" ]]; then
            echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置（缺少 $cfg_key）"
            exit 1
        fi
        printf -v "$cfg_key" '%s' "$cfg_value"

        # Second group: a missing entry keeps jq's literal "null", as before,
        # but is now reported instead of passing silently.
        cfg_value=$(jq -r --arg node "$second_node" --arg key "$cfg_key" \
            '.[$node][$key]' "$config_file")
        if [[ -z "$cfg_value" || "$cfg_value" == "null" ]]; then
            echo "警告：配置文件中未找到 star_node=$second_node 的 $cfg_key" >&2
        fi
        printf -v "node$((cfg_idx + 4))_${cfg_field}" '%s' "$cfg_value"
    done
done
star_node="$target_star_node"

# bmc_host1..bmc_host8: last octet of each BMC address.
for cfg_idx in 1 2 3 4 5 6 7 8; do
    cfg_key="node${cfg_idx}_bmc"
    cfg_value="${!cfg_key}"
    printf -v "bmc_host${cfg_idx}" '%s' "${cfg_value##*.}"
done
unset cfg_idx cfg_field cfg_key cfg_value

#######################################
# Translate a relative node name into an absolute one.
# Arguments:
#   $1 - relative node name of the form "node<digits>" (e.g. node1)
#   $2 - number of the first node of the group (non-negative integer)
# Outputs:
#   "node<N>" on stdout, where N = <digits> + $2 - 1; errors on stderr
# Returns:
#   0 on success, 1 when either argument is malformed
#######################################
convert_node() {
    local input_node="$1"
    local start_node="$2"
    local node_suffix
    local target_node_num

    if [[ "$input_node" =~ ^node([0-9]+)$ ]]; then
        node_suffix="${BASH_REMATCH[1]}"
    else
        echo "错误：输入 node 格式无效，需为 'node+数字'（如 node1、node2）" >&2
        return 1
    fi
    if ! [[ "$start_node" =~ ^[0-9]+$ ]]; then
        echo "错误：起始偏移量需为正整数" >&2
        return 1
    fi

    # Force base 10: without the 10# prefix a leading zero ("08", "09")
    # is read as octal and the arithmetic expansion fails.
    target_node_num=$(( 10#$node_suffix + 10#$start_node - 1 ))
    echo "node$target_node_num"
    return 0
}

#######################################
# Run one cross-talk measurement for a port on two BMCs:
#   1. write 0 to OCS register 0x100082 on the active side (OCS 1..8),
#   2. write 0xff to the same register on the cross side,
#   3. read "cmis mon" and RSSI data from both sides,
#   4. pass everything to collect_test_data_onoc_rssi.py via environment
#      variables,
#   5. write 0xff to the register on the active side again.
# Globals:   exp_tool, debug_mode (read; debug_mode defaults to 0)
# Arguments:
#   $1 - active BMC IP
#   $2 - cross BMC IP
#   $3 - port (used on both sides)
#   $4 - topology label, exported as LOOP
# Outputs:   progress and debug text on stdout
#######################################
collect_onet_cross() {
    local active_bmc=$1
    local cross_bmc=$2
    local active_port=$3
    local ocs_topo=$4
    local active_host="https://$active_bmc"
    local cross_host="https://$cross_bmc"
    local active_node cross_node
    active_node=$(echo "$active_bmc" | cut -d '.' -f 4)
    cross_node=$(echo "$cross_bmc" | cut -d '.' -f 4)
    local debug="${debug_mode:-0}"
    local cmis_result=""

    # The helpers are defined inside this body on purpose: the function is
    # exported and executed in a child bash through `bash -c`, so whatever
    # it needs has to travel with it.

    # Write $3 to register 0x100082 of OCS 1..8 on host $1 / port $2.
    # Up to six attempts per OCS, retried only while the output contains
    # "Locked". $4 = 1 additionally echoes the command line itself.
    _onet_write_all_ocs() {
        local host_url=$1 port=$2 value=$3 show_cmd=$4
        local ocs attempt output
        local -a cmd
        for ocs in {1..8}; do
            for ((attempt = 1; attempt <= 6; attempt++)); do
                # $exp_tool is left unquoted so it is word-split exactly as
                # when the command was kept in a single string.
                # shellcheck disable=SC2206
                cmd=($exp_tool --host "$host_url" --port "$port" --cmd wb-ocs --reg 0x100082 --value "$value" --ocs "$ocs")
                if [[ "$show_cmd" -eq 1 ]]; then
                    echo "${cmd[*]}"
                fi
                if [[ "$debug" -eq 1 ]]; then
                    echo "执行命令：${cmd[*]}"
                fi
                output=$("${cmd[@]}" 2>&1)
                # Unquoted so multi-line tool output is logged on one line.
                # shellcheck disable=SC2086
                echo $output
                if ! grep -q "Locked" <<<"$output"; then
                    break
                fi
                sleep 3
            done
        done
    }

    # Read "cmis mon" from BMC $1 / port $2 into cmis_result, retrying up to
    # six times while the output reports a poll or lock failure.
    # $3 is the label used for the debug line.
    _onet_poll_cmis() {
        local bmc=$1 port=$2 label=$3
        local attempt
        for ((attempt = 1; attempt <= 6; attempt++)); do
            cmis_result=$(ocsdiag -i "$bmc" -e "$port" -c vcmd -p "cmis mon" 2>&1)
            if [[ "$debug" -eq 1 ]]; then
                echo "$label: $cmis_result"
            fi
            # -E is required: the pattern is an alternation. With plain
            # grep the "|" is literal and the retry never triggers.
            if ! grep -qE "AsyncPoll failed|lock: Failed" <<<"$cmis_result"; then
                break
            fi
            sleep 3
        done
    }

    echo -e "\n============= active_port: $active_port ==============="
    # Active side: value 0.
    _onet_write_all_ocs "$active_host" "$active_port" 0 1

    # Cross side: value 0xff.
    _onet_write_all_ocs "$cross_host" "$active_port" 0xff 0

    echo -e "\nsleep 10s, waiting ocs stable..."
    sleep 10

    echo -e "\n----- collect $active_bmc: p$active_port cmis data -----"
    _onet_poll_cmis "$active_bmc" "$active_port" "active_port_cmis $active_host"
    local active_port_cmis="$cmis_result"

    echo -e "\n----- collect $cross_bmc: p$active_port cmis data -----"
    _onet_poll_cmis "$cross_bmc" "$active_port" "cross_port_cmis $cross_host"
    local cross_port_cmis="$cmis_result"

    echo -e "\n----- collect p$active_port RSSI data -----"
    sleep 2
    local cross_port_rssi active_port_rssi
    cross_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$cross_host" --port "$active_port" 2>&1)
    active_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$active_host" --port "$active_port" 2>&1)

    # Debug output (enable on demand).
    if [[ "$debug" -eq 1 ]]; then
        echo "cross_port_rssi $cross_host: $cross_port_rssi"
        echo "active_port_rssi $active_host: $active_port_rssi"
    fi

    echo -e "\n---- 调用 Python 分析 $active_port 数据 ----"
    export LOOP="$ocs_topo"
    export ACTIVE_PORT="$active_node-P$active_port"
    export CROSS_PORT="$cross_node-P$active_port"
    export ACTIVE_PORT_RSSI="$active_port_rssi" ACTIVE_PORT_CMIS="$active_port_cmis"
    export CROSS_PORT_RSSI="$cross_port_rssi" CROSS_PORT_CMIS="$cross_port_cmis"

    python3 collect_test_data_onoc_rssi.py

    # Drop the hand-over variables so they do not leak into the next call.
    unset LOOP ACTIVE_BMC ACTIVE_PORT CROSS_BMC CROSS_PORT
    unset ACTIVE_PORT_RSSI ACTIVE_PORT_CMIS CROSS_PORT_RSSI CROSS_PORT_CMIS

    echo -e "\n=============$active_bmc and $cross_bmc: p$active_port 数据处理完成 ===============\n"
    sleep 6
    echo -e "\n============= disable active port: $active_port ==============="
    # Active side again: value 0xff.
    _onet_write_all_ocs "$active_host" "$active_port" 0xff 1
}

export -f collect_onet_cross

#######################################
# Run the ONOC and ONET cross-talk measurements for one port on a pair of
# BMCs, in both directions.
# Globals:   bmc_host1 (read)
# Arguments:
#   $1 - active BMC IP
#   $2 - cross BMC IP
#   $3 - port
# Outputs:   progress text on stdout, warnings on stderr
#######################################
link_crosstalk () {
    local active_bmc=$1
    local cross_bmc=$2
    local active_port=$3
    local active_node cross_node
    active_node=$(echo "$active_bmc" | cut -d '.' -f 4)
    cross_node=$(echo "$cross_bmc" | cut -d '.' -f 4)
    local ocs_topo=""

    ########### onoc cross talk #############
    echo "==============test node:$active_node,$cross_node port:$active_port onoc cross talk==========="
    case "$active_port" in
        4|8) ocs_topo="onoc6" ;;
        6)   ocs_topo="onoc7" ;;
        *)   ocs_topo="" ;;
    esac

    if [[ -n "$ocs_topo" ]]; then
        timeout 30s bash set_port_topo.sh "$active_bmc" "$active_port" "$ocs_topo"
        timeout 30s bash set_port_topo.sh "$cross_bmc" "$active_port" "$ocs_topo"
        timeout 120s bash -c 'collect_onet_cross "$@"' _ "$active_bmc" "$cross_bmc" "$active_port" "$ocs_topo"
        timeout 120s bash -c 'collect_onet_cross "$@"' _ "$cross_bmc" "$active_bmc" "$active_port" "$ocs_topo"
    else
        # No ONOC topology is mapped for this port. Running the steps above
        # with an empty topology would call set_port_topo.sh with a missing
        # argument and record rows with an empty topology label.
        # NOTE(review): confirm whether ports other than 4/6/8 (e.g. 2)
        # should get an ONOC topology of their own.
        echo "警告：端口 $active_port 未定义 ONOC 拓扑，跳过 ONOC 串扰测试" >&2
    fi

    ########### onet cross talk #############
    echo "test node:$active_node,$cross_node port:$active_port onet cross talk"
    # Nodes are grouped in fours, counted from bmc_host1.
    local snode1=$(((active_node - bmc_host1) / 4))
    local snode2=$(((cross_node - bmc_host1) / 4))

    if [ "$snode1" -eq "$snode2" ]; then
        echo "the same snode set topo-onetb"
        ocs_topo="onetb"
    else
        echo "not the same snode set topo-oneta"
        ocs_topo="oneta"
    fi

    # The ONET measurement is not run on port 8.
    if [ "$active_port" -ne 8 ]; then
        timeout 30s bash set_port_topo.sh "$active_bmc" "$active_port" "$ocs_topo"
        timeout 30s bash set_port_topo.sh "$cross_bmc" "$active_port" "$ocs_topo"
        timeout 120s bash -c 'collect_onet_cross "$@"' _ "$active_bmc" "$cross_bmc" "$active_port" "$ocs_topo"
        timeout 120s bash -c 'collect_onet_cross "$@"' _ "$cross_bmc" "$active_bmc" "$active_port" "$ocs_topo"
    fi
}
# Braces are required around the host ids: "$bmc_host1_" would be read as a
# variable named "bmc_host1_" (unset), silently dropping the id from the name.
run_stamp=$(date +%Y%m%d_%H%M%S)
LOG_FILE="logs/summary_logs/64_xz-crosstalk_ocs_rssi_${bmc_host1}-${bmc_host8}_${run_stamp}.log"
csv_file="csv_data/64_xz-crosstalk_ocs_rssi_${bmc_host1}-${bmc_host8}_${run_stamp}.csv"
export CSV_FILE="$csv_file"

# tee does not create directories; chmod below expects csv_data/ to exist.
mkdir -p "$(dirname "$LOG_FILE")" "$(dirname "$csv_file")" || exit 1

# Send all output to the terminal and to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1
# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

export debug_mode=1

# Write 0 to register 0x100082 of OCS 1..8 on ports 2/4/6/8 of every node.
# Each write is attempted up to six times, retried only while the tool
# output contains "Locked".
nodes=("$node1_bmc" "$node2_bmc" "$node3_bmc" "$node4_bmc" "$node5_bmc" "$node6_bmc" "$node7_bmc" "$node8_bmc" )
for node in "${nodes[@]}"; do
  # A node missing from the configuration shows up as "" or the literal
  # "null" printed by jq; there is nothing to talk to in that case.
  if [[ -z "$node" || "$node" == "null" ]]; then
    echo "警告：跳过未配置的节点"
    continue
  fi
  for port in 2 4 6 8; do
    for ocs in {1..8}; do
        for ((m=1; m<=6; m++)); do
            # $exp_tool is left unquoted so it is word-split exactly as when
            # the command was kept in a single string.
            # shellcheck disable=SC2206
            reg_cmd=($exp_tool --host "https://$node" --port "$port" --cmd wb-ocs --reg 0x100082 --value 0 --ocs "$ocs")
            [ "$debug_mode" -eq 1 ] && echo "执行命令：${reg_cmd[*]}"
            output=$("${reg_cmd[@]}" 2>&1)
            # Unquoted so multi-line tool output is logged on one line.
            # shellcheck disable=SC2086
            echo $output
            if ! echo "$output" | grep -q "Locked"; then
                success=true
                break
            fi 
            sleep 3
        done
    done
  done
done

unset debug_mode exp_tool CSV_FILE
unset -f collect_onet_cross

echo -e "\n所有循环执行完成,数据已保存至:$csv_file"
chmod -R 755 csv_data/
# Record the end time.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                                                                                                   to_yd/README                                                                                        0000750 0000000 0000000 00000003223 15120511737 011552  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   脚本名字包含zte的脚本已经在zte环境测试通过，脚本名字没有包含zte的脚本上海环境调试通过。

1. 53_zte-13141516-setup_4G_ONOC-linkup_recovery_CE.sh --并行setup 13141516 ONOC LINK-4G port 2468 为ONOC:"0403020108070605", 可通过loop_times=1 改loop次数
2. 52_zte-13141516-setup_4G-linkup_recovery_CE.sh -- 并行setup 13141516 EP32 TOPO LINK- port46 为ONETA，port28为ONOC:"0403020108070605", 可通过loop_times=1 改loop次数
3. 52_zte-13141516-reset_4G-GPU.sh  -- 并行13141516 RESET 4G GPU
4. 52_zte-5678-setup_4G-linkup_recovery_CE.sh -- 并行setup 5678 EP32 TOPO LINK- port46 为ONETA，port2为ONOC5，port8为ONOC6, 可通过loop_times=1 改loop次数
5. 52_zte-1234-setup_4G-linkup_recovery_CE.sh -- 并行setup 1234 EP32 TOPO LINK- port46 为ONETA，port2为ONOC5，port8为ONOC6, 可通过loop_times=1 改loop次数

6. 50_set_port_topo.sh bmc_ip port topo --配置port topo 脚本可配置参数执行
7. 50_zte_oneta-link_recovery_CE.sh svr1_ip svr1_bmc_ip svr2_ip svr2_bmc_ip port -- setup ONETA linkup 脚本， 不包含 配置topo和reset GPU步骤
8. 50_zte_onoc-link_recovery_CE.sh svr_ip svr_bmc_ip port -- setup ONOC linkup 脚本， 不包含 配置topo和reset GPU步骤
9. 50_zte_reset_br_gpu.sh svr_ip -- reset GPU 脚本

新脚本可以基于第 6-9 项脚本自由组合而成。



cross-talk脚本：
1. 64_zte_13-16-onoc_swa_ocs_rssi.sh， 62_zte_21-24-onoc_swa_ocs_rssi.sh
   直接以 root 账户运行，运行结束后从 csv_data 目录中获取相应数据
    
    collect_onoc_cross active_bmc active_port cross_bmc cross_port
    例如：
    collect_onoc_cross node1_bmc 8 node2_bmc 8
    根据光纤来配置对应的cross talk
                                                                                                                                                                                                                                                                                                                                                                             to_yd/reset_br_gpu.sh                                                                               0000750 0000000 0000000 00000001563 15120511737 013713  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# Reset every GPU on one server and report the outcome as a single JSON line on
# stdout; all progress output goes to stderr.
#
# Usage: reset_br_gpu.sh <server_ip>
#
# NOTE(review): credentials are hard-coded in this file and passed on the command
# line (visible in `ps`); consider moving them to a protected file or environment.

#configure
root_user="root"
root_psswd="RCms@Zte3"
remote_user="pict"
remote_psswd='PicT1!2@3#4$'
active_remote1=${1:-}
gpu_count=8
max_attempts=3
retry_delay=60

# Result reported by the EXIT trap. It starts pessimistic and is flipped only when
# a reset attempt succeeds; previously these were never assigned, so the JSON was
# emitted with an empty "success" value.
current_success=false
fail_reason="GPU RESET fail"

# Print the final result as JSON; runs on every exit path via the EXIT trap.
cleanup_and_output() {
    printf '{"success": %s, "reason": "%s"}\n' "$current_success" "$fail_reason"
}
trap cleanup_and_output EXIT

if [[ -z "$active_remote1" ]]; then
    fail_reason="missing server ip argument"
    echo "usage: $0 <server_ip>" >&2
    exit 1
fi

echo -e "\n------:RESET $active_remote1 GPU ------" >&2

for ((i=1; i<=max_attempts; i++)); do
    rest_gpu=$(sshpass -p "$remote_psswd" ssh "$remote_user@$active_remote1" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"brsmi reset -g\""' 2>&1)
    echo "reset $active_remote1 GPU:" >&2
    echo "$rest_gpu" >&2
    # One "GPU<n> Successed." line is expected per GPU.
    success_count=$(echo "$rest_gpu" | grep -c "GPU[0-7] Successed.")
    if [[ "$success_count" -eq "$gpu_count" ]]; then
        echo -e "times$i: GPU RESET success" >&2
        current_success=true
        fail_reason=""
        break
    fi
    echo -e "times$i: GPU RESET fail" >&2
    # Only pause when another attempt follows; sleeping after the last one just
    # delayed the failure report.
    if (( i < max_attempts )); then
        sleep "$retry_delay"
    fi
done
# NOTE(review): the exit status stays 0 even when all attempts fail, as before;
# the JSON line carries the verdict.
                                                                                                                                             to_yd/68_xz_BL-32_reset-gpu_linkup_oneta.sh                                                         0000750 0000000 0000000 00000036064 15120511737 017557  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# ---- Static settings (not tied to a particular node) ----
root_user='root'
root_psswd='RCms@Zte3'
remote_user='pict'
remote_psswd='PicT1!2@3#4$'
exp_tool='./smbus-tool/build/whiteriver_exp.exe'
wait_time=10
loop_times=100

# ---- Feature switches ----
reset_gpu_flag=false set_topo_flag=false run_all=true

# ---- Node settings: config file name plus empty placeholders ----
config_file='node_configs.json'
target_star_node=
node1_ip= node1_bmc=
node2_ip= node2_bmc=
node3_ip= node3_bmc=
node4_ip= node4_bmc=
star_node=

################################# Argument parsing #################################
# Recognised options:
#   --node <id>                       id stored in target_star_node
#   --test [reset_gpu] [set_topo]...  clears run_all and raises the named flags
# Anything else prints an error and exits with status 1.
while [[ $# -gt 0 ]]; do
    if [[ "$1" == "--node" ]]; then
        shift
        # The option needs a value that is not itself another option.
        if [[ -n "$1" && ! "$1" =~ ^-- ]]; then
            target_star_node="$1"
            shift
        else
            echo "错误：--node 后必须指定节点编号（如 --node 21）"
            exit 1
        fi
    elif [[ "$1" == "--test" ]]; then
        shift
        run_all=false
        # Consume every following word up to the next option.
        until [[ $# -eq 0 || "$1" =~ ^-- ]]; do
            if [[ "$1" == "reset_gpu" ]]; then
                reset_gpu_flag=true
            elif [[ "$1" == "set_topo" ]]; then
                set_topo_flag=true
            else
                echo "错误：不支持的测试参数 '$1'，仅支持 reset_gpu 或 set_topo"
                exit 1
            fi
            shift
        done
    else
        echo "错误：未知参数 '$1'，支持的参数：--node <编号> --test [reset_gpu|set_topo...]"
        exit 1
    fi
done

################################# Load JSON config #################################
# Check prerequisites: jq must be installed, --node must have been given, and the
# config file must exist.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi

# Read node<N>_ip / node<N>_bmc for the selected star node and derive bmc_host<N>
# (the 4th dot-separated field of the BMC address).
# The node id and field name are handed to jq as --arg data instead of being
# spliced into the filter text, so an id containing quotes or jq syntax cannot
# alter the query.
for node_idx in 1 2 3 4; do
    for node_field in ip bmc; do
        node_key="node${node_idx}_${node_field}"
        node_value=$(jq -r --arg n "$target_star_node" --arg k "$node_key" '.[$n][$k]' "$config_file")
        # Every address is used later, so reject a missing one here rather than
        # running the test against the literal string "null".
        if [[ -z "$node_value" || "$node_value" == "null" ]]; then
            echo "错误：配置文件中未找到 star_node=$target_star_node 的有效配置 ($node_key)"
            exit 1
        fi
        printf -v "$node_key" '%s' "$node_value"
    done
    bmc_ref="node${node_idx}_bmc"
    printf -v "bmc_host${node_idx}" '%s' "$(echo "${!bmc_ref}" | cut -d '.' -f 4)"
done
star_node="$target_star_node"

LOG_FILE="logs/summary_logs/stress-4-6-8_links_onetb_recovery_CMIS-mon_$(date +%Y%m%d_%H%M%S).log"
# Create the log directory first; without it tee cannot create the log file.
mkdir -p -- "${LOG_FILE%/*}" || { echo "cannot create log directory ${LOG_FILE%/*}" >&2; exit 1; }
# Mirror stdout and stderr to the terminal and append them to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

################################# RESETGPU #########################
# Reset all GPUs on one server over ssh, retrying up to three times.
# Globals:   remote_user, remote_psswd (read)
# Arguments: $1 - address of the server to reset
# Outputs:   progress and the raw tool output on stderr
# Returns:   0 once all 8 GPUs report success, 2 if every attempt failed
reset_gpu() {
    local active_remote1=$1
    local gpu_count=8
    local max_attempts=3
    # The counter is local on purpose: the original used the global "i", which is
    # also the round counter of the main loop.
    local attempt rest_gpu success_count

    echo -e "\n------:RESET $active_remote1 GPU ------" >&2
    for ((attempt=1; attempt<=max_attempts; attempt++)); do
        rest_gpu=$(sshpass -p "$remote_psswd" ssh "$remote_user@$active_remote1" 'echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"brsmi reset -g\""' 2>&1)
        echo "reset $active_remote1 GPU:" >&2
        echo "$rest_gpu" >&2
        # One "GPU<n> Successed." line is expected per GPU.
        success_count=$(echo "$rest_gpu" | grep -c "GPU[0-7] Successed.")
        if [[ "$success_count" -eq "$gpu_count" ]]; then
            echo -e "times$attempt: GPU RESET success" >&2
            return 0
        fi
        echo -e "times$attempt: GPU RESET fail" >&2
        # Pause only when another attempt follows.
        if (( attempt < max_attempts )); then
            sleep 20
        fi
    done
    return 2
}

############################ disable_all_ltssm #########################
# Run "./ltssm -d" inside the mysccl-zds container on one server and verify that
# all 8 GPUs report the disable as done.
# Globals:   remote_user, remote_psswd (read)
# Arguments: $1 - address of the server
# Outputs:   the command line, the tool output and a verdict on stdout
# Returns:   0 when 8 matching "done" lines are found, 2 otherwise
disable_all_ltssm() {
    local node_ip=$1
    local gpu_count=8
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ltssm -d\""'
    local dis_gpuport success_count

    echo "sshpass -p '$remote_psswd' ssh $remote_user@$node_ip '$remote_cmd'"
    dis_gpuport=$(sshpass -p "$remote_psswd" ssh "$remote_user@$node_ip" "$remote_cmd")

    echo "node-$node_ip disable status:"
    echo "$dis_gpuport"
    # Count the per-GPU confirmation lines. The original grepped an undefined
    # variable ($dis_localgpu), so the count was always 0 and the check always failed.
    success_count=$(echo "$dis_gpuport" | grep -cE '^GPU [0-7], HW\[[0-9]+\]: disable ltssm \(2, 4, 6, 8, 10\) done$')
    if [[ "$success_count" -eq "$gpu_count" ]]; then
        echo -e "GPU RESET success"
        return 0
    fi
    echo -e "GPU RESET FAIL"
    return 2
}

# Run "./scripts/disable_port_ltssm <port>" inside the mysccl-zds container on one
# server and verify that 8 "disable ltssm <port> done" lines come back.
# Globals:   remote_user, remote_psswd (read)
# Arguments: $1 - address of the server
#            $2 - port number to disable
# Outputs:   the command line, the tool output and a verdict on stdout
# Returns:   0 when 8 matching lines are found, 2 otherwise
disable_port_ltssm() {
    local node_ip=$1
    local node_port=$2
    local gpu_count=8
    # The port is inserted into the remote command here, so the logged line below
    # shows the real value (the original log printed the literal text $node_port).
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./scripts/disable_port_ltssm '"$node_port"'\""'
    local dis_gpuport success_count

    echo "sshpass -p '$remote_psswd' ssh $remote_user@$node_ip '$remote_cmd'"
    dis_gpuport=$(sshpass -p "$remote_psswd" ssh "$remote_user@$node_ip" "$remote_cmd")

    echo "node-$node_ip port$node_port disable status:"
    echo "$dis_gpuport"
    # One confirmation line per GPU is expected.
    success_count=$(echo "$dis_gpuport" | grep -c "HW\[[0-7]\]: disable ltssm ${node_port} done\$")
    if [[ "$success_count" -eq "$gpu_count" ]]; then
        echo -e "disable port ltssm success"
        return 0
    fi
    echo -e "disable port ltssm FAIL"
    return 2
}

################################# RESETEXP #########################
# Reset one EXP through the expander tool, then read its status and treat the reset
# as successful when the seconds field of TIM_SW is below 5.
# Globals:   exp_tool (read); current_success, fail_reason (written on failure)
# Arguments: $1 - host passed to the tool's --host option
#            $2 - port passed to the tool's --port option
# Outputs:   the commands, the status dump and a verdict on stdout
# Returns:   0 on success, 2 when the time is too large or cannot be parsed
reset_exp() {
    local host="$1"
    local port="$2"
    local exp_state tim_sw_time sec_part seconds_str seconds

    # NOTE(review): "clod pri" looks like a typo for "cold pri"; left unchanged
    # because the tool's accepted parameter values are not visible here - confirm.
    echo "$exp_tool" --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    "$exp_tool" --host "$host" --port "$port" --cmd reset --param "clod pri" 2>&1
    sleep 3
    echo "chek EXP stat..."
    exp_state=$("$exp_tool" --host "$host" --port "$port" --cmd stat 2>&1)
    echo "exp state:"
    echo "$exp_state"

    # Pull the text after "TIM_SW: " up to the next comma, keep its last
    # colon-separated field, and take the first run of digits from that.
    # NOTE(review): only the seconds field is compared, so a value such as 00:01:03
    # would pass as 3 - confirm the TIM_SW format.
    tim_sw_time=$(echo "$exp_state" | grep "TIM_SW:" | sed -n 's/.*TIM_SW: \([^,]*\).*/\1/p')
    sec_part=$(echo "$tim_sw_time" | awk -F':' '{print $NF}')
    seconds_str=$(echo "$sec_part" | tr -d '\n' | grep -oE '[0-9]+' | head -n1)
    if [[ ! "$seconds_str" =~ ^[0-9]+$ ]]; then
        echo "秒数格式无效（提取值: [$seconds_str], 视为FAIL"
        # The original executed "continue" here although no loop encloses it; fail
        # explicitly instead.
        current_success=false
        fail_reason="GPU EXP reset FAIL"
        return 2
    fi
    # Force base 10 so values like "08" are not read as octal.
    seconds=$((10#$seconds_str))

    if [[ "$seconds" -lt 5 ]]; then
        echo -e "EXP reset success"
        return 0
    fi
    echo "TIM_SW: $seconds s, EXP reset FAIL"
    current_success=false
    fail_reason="GPU EXP reset FAIL"
    return 2
}

################################# load gpu topo #########################
# Run "./ocsTopo -s mesh_6p_32.json" inside the mysccl-zds container on one server
# and print whatever the tool returns.
# Globals:   remote_user (read); node_ip, gpu_topo, output (written)
# Arguments: $1 - address of the server
#            $2 - stored in gpu_topo; the topology file name itself is fixed
# Outputs:   banner, the command line and the tool output on stdout
load_gputopo() {
    node_ip=$1
    gpu_topo=$2
    local ssh_secret='PicT1!2@3#4$'
    local topo_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"./ocsTopo -s mesh_6p_32.json\""'

    printf '\n----load-gpu-topo ----\n'
    # Show the command, then run it with stderr folded into the captured text.
    echo sshpass -p "$ssh_secret" ssh $remote_user@$node_ip "$topo_cmd"

    output=$(sshpass -p "$ssh_secret" ssh $remote_user@$node_ip "$topo_cmd" 2>&1)
    sleep 1
    printf '%s\n' "load topo result:"
    echo "$output"
    # A per-HWID "Ready" check used to follow here; it is disabled in the original,
    # so no readiness verification is performed.
}

################################# LINKUP #########################
# Train one ONET link by running "ocsTopo -p <port>" on both end points at the same
# time, then check that the second end point reports all 8 HWIDs as Ready.
# Globals:   remote_user, remote_psswd (read)
# Arguments: $1 - address of the first server    $2 - its port
#            $3 - address of the second server   $4 - its port
# Outputs:   the command lines, the second side's tool output and a verdict
# Returns:   0 when 8 "Ready" lines are found for the second side, 2 otherwise
#            (the original returned 0 in both cases, so callers never saw a failure)
onet_linkup() {
    local node1_ip=$1
    local node1_port=$2
    local node2_ip=$3
    local node2_port=$4
    local hwid_count=8
    local cmd_head='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '
    local cmd_tail='\""'
    local node1_cmd="${cmd_head}${node1_port}${cmd_tail}"
    local node2_cmd="${cmd_head}${node2_port}${cmd_tail}"
    local peer_pid link1_output link1_count

    echo -e "\n----ONET- LINKUP ----"
    #link 1 training
    echo sshpass -p "$remote_psswd" ssh "$remote_user@$node1_ip" "$node1_cmd"
    echo sshpass -p "$remote_psswd" ssh "$remote_user@$node2_ip" "$node2_cmd"

    # The first side trains in the background while the second side is captured.
    sshpass -p "$remote_psswd" ssh "$remote_user@$node1_ip" "$node1_cmd" &
    peer_pid=$!
    # 2>&1 must sit on the command's own line; on a separate line inside $(...) it
    # is a stand-alone no-op and stderr is not captured.
    link1_output=$(sshpass -p "$remote_psswd" ssh "$remote_user@$node2_ip" "$node2_cmd" 2>&1)
    # Make sure the first side has finished before judging the link.
    wait "$peer_pid"
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    ## check all HWID port ready or not
    link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node2_port\]: Ready")
    if [[ "$link1_count" -eq "$hwid_count" ]]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

# Train one ONOC link by running "ocsTopo -p <port>" on a single server, then check
# that all 8 HWIDs report the port as Ready.
# Globals:   remote_user, remote_psswd (read)
# Arguments: $1 - address of the server
#            $2 - port to train
# Outputs:   the command line, the tool output and a verdict on stdout
# Returns:   0 when 8 "Ready" lines are found, 2 otherwise
#            (the original returned 0 in both cases, so callers never saw a failure)
onoc_linkup() {
    local node_ip=$1
    local node_port=$2
    local hwid_count=8
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -p '"$node_port"'\""'
    local link1_output link1_count

    echo -e "\n----ONOC-LINKUP ----"
    #link 1 training
    echo sshpass -p "$remote_psswd" ssh "$remote_user@$node_ip" "$remote_cmd"

    link1_output=$(sshpass -p "$remote_psswd" ssh "$remote_user@$node_ip" "$remote_cmd" 2>&1)
    sleep 2
    echo "link1 train:"
    echo "$link1_output"

    ## check all HWID port ready or not
    link1_count=$(echo "$link1_output" | grep -c "HWID [0-7] Port\[$node_port\]: Ready")
    if [[ "$link1_count" -eq "$hwid_count" ]]; then
        echo -e "RETRAIN LINK 1 success"
        return 0
    fi
    echo "RETRAIN LINK 1 ready FAIL"
    return 2
}

################################# GPUSPEED #########################
# Check, with up to three tries, that "ocsTopo -c" on one server lists the given
# port as "GEN: 5, Width: 8" on all 8 GPUs.
# Globals:   remote_user, remote_psswd (read); current_success, fail_reason (written)
# Arguments: $1 - address of the server to query
#            $2 - port number to look for
# Outputs:   the matching lines, the match count and a verdict on stdout
# Returns:   0 when 8 matching lines are found, 2 after three failed tries
gpu_speed() {
    local host=$1
    local port=$2
    local expected_count=8
    local GEN5_pattern="Port $port: GEN: 5, Width: 8"
    local remote_cmd='echo RCms@Zte3 | su - root bash -c "docker exec mysccl-zds bash -c \"ocsTopo -c\""'
    local attempt link1_full link1_GEN link1_count

    #----------- check all GPU port is GEN5X8 or not-------------
    echo -e "\n----ONETA- check all GPU port $port is GEN5X8 or not ----"
    for ((attempt=1; attempt<=3; attempt++)); do
        # Query the server named by $1; the original used $node_ip, a leftover
        # global, so the host argument was ignored.
        link1_full=$(sshpass -p "$remote_psswd" ssh "$remote_user@$host" "$remote_cmd")
        link1_GEN=$(echo "$link1_full" | grep "Port $port" 2>&1)
        echo -e "link1:\n $link1_GEN"
        link1_count=$(echo "$link1_GEN" | grep -c "$GEN5_pattern" 2>&1)
        echo "link1 gen5 count: $link1_count"
        if [[ "$link1_count" -eq "$expected_count" ]]; then
            echo -e "RETRAIN LINK 1 GEN5X8 success"
            # NOTE(review): this overwrites a failure recorded earlier in the same
            # round - confirm that is intended.
            current_success=true
            return 0
        fi
        echo -e "RETRAIN LINK 1 GEN5X8 FAIL"
        current_success=false
        fail_reason="RETRAIN LINK 1 GEN5X8 FAIL"
        # Pause only when another try follows.
        if (( attempt < 3 )); then
            sleep 6
        fi
    done
    return 2
}

# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

# Comma-separated BMC host ids shared by every smbus-tool helper call below.
bmc_host_list="$bmc_host1,$bmc_host2,$bmc_host3,$bmc_host4"

# Stress loop. Each round optionally programs the port topology, resets the GPUs
# and the EXPs, brings up the ONET links (ports 4/6) and the ONOC links (ports 2/8),
# and prints a per-round verdict.
# NOTE(review): reset_gpu_flag and run_all are parsed from the command line but are
# not consulted here - the GPU reset always runs and the topo step runs only with
# "--test set_topo". Confirm whether that is intended.
for ((i=1; i<=loop_times; i++)); do
    echo -e "\n============= 压测linkup 测试第$i 轮 ==============="
    echo
    current_success=true
    fail_reason=""

    ########### set 4G TOPO #############################
    if [ "$set_topo_flag" = true ]; then
        echo -e "\n====== set 4G toop ======"
        echo
        # Same four port settings on every node, all started in parallel.
        for topo_bmc in "$node1_bmc" "$node2_bmc" "$node3_bmc" "$node4_bmc"; do
            bash set_port_topo.sh "$topo_bmc" 2 onoc5 &
            bash set_port_topo.sh "$topo_bmc" 4 oneta &
            bash set_port_topo.sh "$topo_bmc" 6 oneta &
            bash set_port_topo.sh "$topo_bmc" 8 onoc6 &
        done
        wait
        echo
    fi

    #RESET 4G GPU
    # NOTE(review): the results printed by reset_br_gpu.sh are not evaluated here.
    echo -e "\n====== reset 4G GPU ======"
    echo
    echo bash reset_br_gpu.sh "$node1_ip"
    for reset_ip in "$node1_ip" "$node2_ip" "$node3_ip" "$node4_ip"; do
        bash reset_br_gpu.sh "$reset_ip" &
    done
    wait
    sleep 3
    echo

    #RESETEXP
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----reset exp ----"
    (cd smbus-tool && python3 scripts/exp_cold_reset.py --host "$bmc_host_list" --port 2,4,6,8)
    sleep 1

    #OCS-SCREEN
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----ocs_screen before linkup----"
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)

    #onet linkup: "<first ip> <port> <second ip>", the same port on both ends.
    onet_links=(
        "$node1_ip 4 $node3_ip"
        "$node2_ip 4 $node4_ip"
        "$node1_ip 6 $node4_ip"
        "$node2_ip 6 $node3_ip"
    )
    for onet_link in "${onet_links[@]}"; do
        read -r onet_side_a onet_port onet_side_b <<< "$onet_link"
        if ! onet_linkup "$onet_side_a" "$onet_port" "$onet_side_b" "$onet_port"; then
            current_success=false
            # Name the port that was actually trained; the original used $port,
            # which is empty in round 1 and left at 8 by the ONOC loop afterwards.
            fail_reason+=" port-$onet_port linkup fail; "
        fi
        sleep 2
    done

    #onoc linkup
    nodes=("$node1_ip" "$node2_ip" "$node3_ip" "$node4_ip")
    ports=(2 8)
    for node in "${nodes[@]}"; do
        for port in "${ports[@]}"; do
            if ! onoc_linkup "$node" "$port"; then
                current_success=false
                fail_reason+=" port-$port onoc linkup fail; "
                # NOTE(review): this pause happens only after a failure, unlike
                # the ONET section above - confirm.
                sleep 2
            fi
        done
    done

    #ocs-screen
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 10s ocs_screen ----"
    sleep 10
    (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)
    sleep 6
    # Disabled: a second ocs_screen pass after a further 60 s wait.
    # echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]----sleep 60s ocs_screen ----"
    # sleep 60
    # (cd smbus-tool && python3 scripts/ocs_screen.py --host "$bmc_host_list" --port 2,4,6,8 --skip error)

    #### result ###
    echo -e "\n[$(date +"%Y-%m-%d %H:%M:%S")]---- 本轮测试结果 ----"
    if [ "$current_success" = true ]; then
        echo -e "\n第$i轮测试成功"
    else
        echo -e "\n第$i轮测试FAIL: $fail_reason"
    fi

done

# Record the end time.
echo -e "\n=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                                                                                                                                                                                                                                                                                                                                                                            to_yd/64_xz-onet_swa-crosstalk-b_ocs_rssi.sh                                                        0000750 0000000 0000000 00000024604 15120511737 020151  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# ---- Static settings ----
root_user='root'
root_psswd='RCms@Zte3'
remote_user='pict'
remote_psswd='PicT1!2@3#4$'
exp_tool='./smbus-tool/build/whiteriver_exp.exe'
wait_time=10
loop_times=100

# ---- Feature switches ----
reset_gpu_flag=false set_topo_flag=false run_all=true

# ---- Node settings: config file name plus empty placeholders ----
config_file='node_configs.json'
target_star_node=
node1_ip= node1_bmc=
node2_ip= node2_bmc=
node3_ip= node3_bmc=
node4_ip= node4_bmc=
star_node=

# Command-line parsing: the only supported option is "--node <id>", whose value is
# stored in target_star_node.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --node)
            shift
            # The option needs a value that is not itself another option.
            if [[ -z "${1:-}" || "$1" =~ ^-- ]]; then
                echo "错误：--node 后必须指定节点编号（如 --node 21）"
                exit 1
            fi
            target_star_node="$1"
            shift
            ;;
        *)
            # Without this arm an unrecognised argument was never shifted away,
            # so the loop never terminated.
            echo "错误：未知参数 '$1'，支持的参数：--node <编号>"
            exit 1
            ;;
    esac
done

# Check prerequisites: jq must be installed, --node must have been given, and the
# config file must exist.
if ! command -v jq &> /dev/null; then
    echo "错误：未安装 jq 工具，请先执行 'yum install jq -y' 或 'apt install jq -y' 安装"
    exit 1
fi
if [[ -z "$target_star_node" ]]; then
    echo "错误：必须通过 --node 指定节点编号（如 --node 21）"
    exit 1
fi
if [[ ! -f "$config_file" ]]; then
    echo "错误：配置文件 $config_file 不存在，请检查路径"
    exit 1
fi
# The id is used in arithmetic below, so it has to be a plain decimal number.
if ! [[ "$target_star_node" =~ ^[0-9]+$ ]]; then
    echo "错误：--node 后必须指定节点编号（如 --node 21）"
    exit 1
fi

# Two config groups are read: the requested one and the one numbered 4 higher.
# 10# forces base 10 so an id with a leading zero is not read as octal.
first_node="$target_star_node"
second_node="$((10#$target_star_node + 4))"

# node1..node4 come from the first group, node5..node8 from node1..node4 of the
# second group. bmc_host<N> is the 4th dot-separated field of the BMC address.
# The group id and field name are handed to jq as --arg data instead of being
# spliced into the filter text, so they cannot alter the query.
for node_idx in 1 2 3 4 5 6 7 8; do
    if (( node_idx <= 4 )); then
        group_key=$first_node
        group_member=$node_idx
    else
        group_key=$second_node
        group_member=$((node_idx - 4))
    fi
    for node_field in ip bmc; do
        node_value=$(jq -r --arg n "$group_key" --arg k "node${group_member}_${node_field}" '.[$n][$k]' "$config_file")
        # Every address is used later, so reject a missing one here rather than
        # running the test against the literal string "null".
        if [[ -z "$node_value" || "$node_value" == "null" ]]; then
            echo "错误：配置文件中未找到 star_node=$group_key 的有效配置 (node${group_member}_${node_field})"
            exit 1
        fi
        printf -v "node${node_idx}_${node_field}" '%s' "$node_value"
    done
    bmc_ref="node${node_idx}_bmc"
    printf -v "bmc_host${node_idx}" '%s' "$(echo "${!bmc_ref}" | cut -d '.' -f 4)"
done
star_node="$target_star_node"

# Translate a relative node name into an absolute one:
# "node<k>" with start offset s becomes "node<k + s - 1>".
# Arguments: $1 - node name of the form node<digits> (e.g. node1)
#            $2 - start offset, digits only
# Outputs:   the converted node name on stdout; error text on stderr
# Returns:   0 on success, 1 when either argument is malformed
convert_node() {
    local input_node="$1"
    local start_node="$2"
    local node_suffix

    if [[ "$input_node" =~ ^node([0-9]+)$ ]]; then
        node_suffix=${BASH_REMATCH[1]}
    else
        echo "错误：输入 node 格式无效，需为 'node+数字'（如 node1、node2）" >&2
        return 1
    fi
    if ! [[ "$start_node" =~ ^[0-9]+$ ]]; then
        echo "错误：起始偏移量需为正整数" >&2
        return 1
    fi

    # 10# forces base 10; without it values such as "08" or "09" are read as
    # invalid octal numbers and the arithmetic aborts.
    echo "node$(( 10#$node_suffix + 10#$start_node - 1 ))"
    return 0
}


# Write one value to OCS register 0x100082 on OCS 1..8 of a port. Each write is
# tried up to three times, 3 s apart, for as long as the tool output contains
# "Locked".
# Globals:   exp_tool, debug_mode (read)
# Arguments: $1 - host URL for --host   $2 - port   $3 - register value
_write_ocs_reg_all() {
    local host=$1
    local port=$2
    local value=$3
    local ocs attempt output
    local -a cmd

    for ocs in {1..8}; do
        for ((attempt=1; attempt<=3; attempt++)); do
            cmd=("$exp_tool" --host "$host" --port "$port" --cmd wb-ocs --reg 0x100082 --value "$value" --ocs "$ocs")
            if [[ "${debug_mode:-0}" -eq 1 ]]; then
                echo "执行命令：${cmd[*]}"
            fi
            output=$("${cmd[@]}" 2>&1)
            echo "$output"
            if ! echo "$output" | grep -q "Locked"; then
                break
            fi
            sleep 3
        done
    done
}

# Measure crosstalk for one port pair: activate the OCS on the "active" port,
# disable it on the "cross" port, collect CMIS and RSSI readings from both, hand
# them to collect_test_data_onoc_rssi.py through environment variables, and finally
# re-activate the cross port.
# Globals:   exp_tool, star_node, csv_file, i, debug_mode (read)
# Arguments: $1 - NAME of the variable holding the active BMC ip (e.g. node1_bmc)
#            $2 - active port
#            $3 - NAME of the variable holding the cross BMC ip
#            $4 - cross port
# Outputs:   progress, tool output and (when debug_mode is 1) raw readings on stdout
collect_onet_cross() {
    local active_bmc="$1"
    local active_port=$2
    local cross_bmc=$3
    local cross_port=$4
    # Indirect expansion: the arguments are variable names, not addresses.
    local active_bmcip="${!active_bmc}"
    local cross_bmcip="${!cross_bmc}"
    local active_host="https://$active_bmcip"
    local cross_host="https://$cross_bmcip"
    local debug=${debug_mode:-0}
    local active_node cross_node
    local active_port_cmis cross_port_cmis active_port_rssi cross_port_rssi

    active_node=$(convert_node "${active_bmc%_bmc}" "$star_node")
    cross_node=$(convert_node "${cross_bmc%_bmc}" "$star_node")

    echo -e "\n============= active_port: $active_port ==============="
    # Value 0 activates the OCS, value 0xff disables it.
    _write_ocs_reg_all "$active_host" "$active_port" 0
    _write_ocs_reg_all "$cross_host" "$cross_port" 0xff

    echo -e "\nwait 10s, ocs stable..."
    sleep 10

    echo -e "\n----- collect active_port: $active_port data -----"
    active_port_cmis=$(ocsdiag -i "$active_bmcip" -e "$active_port" -c vcmd -p "cmis mon" 2>&1)
    cross_port_cmis=$(ocsdiag -i "$cross_bmcip" -e "$cross_port" -c vcmd -p "cmis mon" 2>&1)
    if [[ "$debug" -eq 1 ]]; then
        echo "active_port_cmis $active_host: $active_port_cmis"
        echo "cross_port_cmis $cross_host: $cross_port_cmis"
    fi
    sleep 2
    cross_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$cross_host" --port "$cross_port" 2>&1)
    active_port_rssi=$(cd smbus-tool && python3 scripts/read_rssi.py --host "$active_host" --port "$active_port" 2>&1)

    # Debug output (enabled when debug_mode is 1).
    if [[ "$debug" -eq 1 ]]; then
        echo "cross_port_rssi $cross_host: $cross_port_rssi"
        echo "active_port_rssi $active_host: $active_port_rssi"
    fi

    echo -e "\n---- 调用 Python 分析 $active_port 数据 ----"
    # The readings travel to the analysis script as environment variables that
    # exist only for this one command.
    CSV_FILE="$csv_file" \
    LOOP="$i" \
    ACTIVE_PORT="$active_node-P$active_port" \
    CROSS_PORT="$cross_node-P$cross_port" \
    ACTIVE_PORT_RSSI="$active_port_rssi" \
    ACTIVE_PORT_CMIS="$active_port_cmis" \
    CROSS_PORT_RSSI="$cross_port_rssi" \
    CROSS_PORT_CMIS="$cross_port_cmis" \
    python3 collect_test_data_onoc_rssi.py

    echo -e "\n=============$active_bmc active_port: $active_port 数据处理完成 ===============\n"
    sleep 2
    # The original banner expanded an undefined variable ($active) and therefore
    # did not say what was being re-activated.
    echo -e "\n============= active cross_port: $cross_port ===============\n"
    _write_ocs_reg_all "$cross_host" "$cross_port" 0
}
LOG_FILE="logs/summary_logs/62_stress_13-16-onoc_check_ocspower_$(date +%Y%m%d_%H%M%S).log"
# The braces matter: "$bmc_host8_" is read as a variable named bmc_host8_ (which is
# empty), so the last host id and the separator were missing from the file name.
csv_file="csv_data/64_xz-onet_swa-crosstalk-b_ocs_rssi_${bmc_host1}-${bmc_host8}_$(date +%Y%m%d_%H%M%S).csv"
# Create both output directories before anything tries to write into them.
mkdir -p -- "${LOG_FILE%/*}" "${csv_file%/*}" || { echo "cannot create output directories" >&2; exit 1; }

# Mirror stdout and stderr to the terminal and append them to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1
# Record the start time.
echo "=== 测试开始于: $(date) ==="
echo "=== 日志文件: $LOG_FILE ==="
echo "=========================="
echo

debug_mode=1

# Start the topology jobs for each BMC given as an argument: ports 4 and 6 as
# oneta, port 8 as onoc6, all in the background. Port 2 (onoc5) is skipped, as it
# was commented out in the original call list.
# The address is quoted so that an empty value cannot shift the port number into
# the address position.
start_topo_jobs() {
    local topo_bmc
    for topo_bmc in "$@"; do
        bash set_port_topo.sh "$topo_bmc" 4 oneta &
        bash set_port_topo.sh "$topo_bmc" 6 oneta &
        bash set_port_topo.sh "$topo_bmc" 8 onoc6 &
    done
}

echo -e "\n====== set spnode1  topo ======"
echo
start_topo_jobs "$node1_bmc" "$node2_bmc" "$node3_bmc" "$node4_bmc"
echo
echo -e "\n====== set spnode2  topo ======"
echo
start_topo_jobs "$node5_bmc" "$node6_bmc" "$node7_bmc" "$node8_bmc"
# Barrier: continue only after every background job has finished.
wait
echo

# Measurement plan. Each entry is "<bmc variable A> <port> <bmc variable B>"; every
# entry is measured in both directions (A active / B cross, then B active / A cross).
cross_plan=(
    # spnode1 > 2
    "node6_bmc 4 node4_bmc"
    "node6_bmc 6 node3_bmc"
    "node5_bmc 4 node3_bmc"
    "node5_bmc 6 node4_bmc"
    # spnode2 > 1
    "node2_bmc 4 node8_bmc"
    "node2_bmc 6 node7_bmc"
    "node1_bmc 4 node7_bmc"
    "node1_bmc 6 node8_bmc"
)
banner_rule="======================================================"

# Run the whole plan loop_times times; "i" stays global because
# collect_onet_cross reads it.
for ((i=1; i<=$loop_times; i++)); do
    printf '\n%s\n' "$banner_rule"
    echo "===================== 第 $i 轮循环 ====================="
    printf '%s\n' "$banner_rule"

    for plan_entry in "${cross_plan[@]}"; do
        read -r plan_first plan_port plan_second <<< "$plan_entry"
        collect_onet_cross "$plan_first" "$plan_port" "$plan_second" "$plan_port"
        collect_onet_cross "$plan_second" "$plan_port" "$plan_first" "$plan_port"
    done
done
echo -e "\n所有循环执行完成,数据已保存至:$csv_file"
chmod -R 755 csv_data/
# Record the end time.
printf '\n%s\n' "=========================="
echo "------ 测试结束于: $(date) ------"
echo "------ 完整结果已保存到日志文件: $LOG_FILE -----"                                                                                                                            to_yd/set_port_topo.sh                                                                              0000750 0000000 0000000 00000003124 15120511737 014126  0                                                                                                    ustar   root                            root                                                                                                                                                                                                                   #!/bin/bash

# configure
#
# Usage: set_port_topo.sh <bmc_ip> <port> <topology>
#   bmc_ip    BMC address; the tool is pointed at https://<bmc_ip>
#   port      value handed to the tool's --port option
#   topology  onoc1..onoc7 or oneta; any other value (including an empty
#             one) selects the fallback route 1212121212121212
#
# Sends "fl786 route <route>" through the exp tool and retries until the
# tool output mentions SetRoute for all eight FL786 units, or until the
# attempt limit is reached.
exp_tool="./smbus-tool/build/whiteriver_exp.exe"
max_attempts=16
retry_delay=10   # seconds to wait between attempts

bmc_ip=$1
port=$2
current_set=$3

# Without a BMC address or a port every attempt is bound to fail, so stop
# right away instead of spending the whole retry budget.
if [[ -z "$bmc_ip" || -z "$port" ]]; then
    echo "usage: ${0##*/} <bmc_ip> <port> <topology>" >&2
    exit 2
fi

bmc_host="https://$bmc_ip"
printf '%s\n' "$current_set"

# Map the topology name to the route string sent to the device.
case "$current_set" in
    onoc1) port_topo="0201040306050807" ;;
    onoc2) port_topo="0403020108070605" ;;
    onoc3) port_topo="0807060504030201" ;;
    onoc4) port_topo="0304010207080506" ;;
    onoc5) port_topo="0605080702010403" ;;
    onoc6) port_topo="0708050603040102" ;;
    onoc7) port_topo="0506070801020304" ;;
    oneta) port_topo="1111111111111111" ;;
    *)     port_topo="1212121212121212" ;;
esac

for ((i = 1; i <= max_attempts; i++)); do
    # Log the command that is about to run.
    echo "$exp_tool --host $bmc_host --port $port --cmd vcmd --param fl786 route $port_topo"
    topo_result=$("$exp_tool" --host "$bmc_host" --port "$port" --cmd vcmd --param "fl786 route $port_topo" 2>&1)
    echo "$topo_result"

    # The attempt counts as successful only when the output contains a
    # SetRoute marker for each of the eight FL786 units.
    topo_ok=true
    for unit in 1 2 3 4 5 6 7 8; do
        if [[ "$topo_result" != *"FL786_${unit}::SetRoute"* ]]; then
            topo_ok=false
            break
        fi
    done

    if [[ "$topo_ok" == true ]]; then
        echo -e "times$i: OCS topo set success"
        break
    fi

    echo -e "times$i: OCS topo set fail"

    # Pause before the next attempt; after the last one there is nothing
    # left to wait for. Written as an if-statement so that the exit status
    # remains 0 when every attempt failed, as it was before.
    if ((i < max_attempts)); then
        sleep "$retry_delay"
    fi
done
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                            