使用 sshpass 和 rsync 自动连接远程服务器,并在 Docker 中运行脚本

news2024/11/21 0:20:42

背景:
一段脚本需要在不同的服务器上运行,每次手动连接太麻烦,所以考虑用 sshpass 和 rsync 来实现自动化。
可以在脚本中配置多台服务器,然后自动去跑脚本。
配置文件

配置文件如下:
脚本主要通过 [xxx] 中的名称来定位并解析各段配置,所以 [xxx] 中的名称不能重复


# cant connect
[L20]
domain      = private_name
arch        = gpu
port        = 22
ip          = 1.0.0.1
password    = 123456
user        = root
device_id   = 2


[V100S_PCIe]
domain      = private_name
arch        = gpu
port        = 22
ip          = 10.10.10.10
password    = 123456
user        = root
device_id   = 0

详细的脚本具体如下,主要内容:
1)解析上面的配置文件,将各项分别存放在变量中
2)远程连接server
3) 同步文件
4)执行脚本
5)将结果同步回来

#!/bin/bash

# very important, otherwise the script will not work
# NOTE(review): an empty histchars presumably keeps '!' inside the command
# strings below from ever being treated as history expansion - confirm.
histchars=

# Help text printed for -h and for any unrecognised option.
usage="Usage: $0 [Options]
Options:
    -f                 Forcibly delete container
exp:
    1. $0 -f
"

# Parse command-line flags. The leading ':' in the optstring selects getopts'
# silent error reporting, so an unknown flag arrives as opt='?'.
while getopts ':hf' opt; do
    case "$opt" in
    f)
        # Tested for non-emptiness before the container build step.
        FORCE_DELETE_DOCKER=true
        ;;
    ? | h)
        # '?' is an unquoted glob and matches any single character, so both
        # -h and unknown flags print the usage text and exit with status 1.
        echo "$usage"
        exit 1
        ;;
    esac
done


# Get DEBUG from environment, default to 0 if not set
# The loggers below compare this numerically: INFO prints when it is >= 1,
# DEBUG prints when it is >= 2; WARN and TRACE always print.
LOG_LEVEL=${DEBUG:-0}

# Constants Variables
# VALID / INVALID are the strings echoed by check_device_id_is_valid.
INVALID="invalid"
VALID="valid"
# ACTIVE / INACTIVE are the container states compared against in the main loop.
ACTIVE="active"
INACTIVE="inactive"
# CUDA / TOPS are the suffixes echoed by get_suffix_from_type.
CUDA="cuda"
TOPS="tops"
# Used as the ssh "-o ConnectTimeout=" value by the remote helpers.
RETRY_TIMES=5

# Error code
# Numeric codes echoed by the check_* / do_remote_cmd_silent helpers and
# translated to text by error_msg.
ERROR_SUCCESS=0
ERROR_CFG_PARAM_INVALID_EMPTY=1
ERROR_CFG_PARAM_INVALID_IPADDR=2
ERROR_CFG_NOT_FOUND=3
ERROR_CURRENT_DEVICE_IS_BUSY=4
ERROR_REMOTE_SSH_COMMAND_FAILED=5
ERROR_REMOTE_SSH_RSYN_FAILED=6
ERROR_CFG_PARAM_INVALID_DEVICE_ID=7
ERROR_CFG_PARAM_INVALID_DEVICE_NAME=8


# log for debug
# Print a red, timestamped "[WARN ]" line on stdout.
#   $1 - message text (backslash escapes in it are interpreted)
WARN() {
    local now
    now=$(date +"%Y-%m-%d %H:%M:%S")
    # %b expands backslash escapes the same way `echo -e` does.
    printf '%b\n' "\033[0;31m[WARN ] ${now}: $1\033[0m"
}

# Print a green, timestamped "[DEBUG]" line on stdout when LOG_LEVEL >= 2.
#   $1 - message text (backslash escapes in it are interpreted)
# Always returns 0 when nothing is printed.
DEBUG() {
    [ "$LOG_LEVEL" -ge 2 ] || return 0
    local now
    now=$(date +"%Y-%m-%d %H:%M:%S")
    # %b expands backslash escapes the same way `echo -e` does.
    printf '%b\n' "\033[0;32m[DEBUG] ${now}: $1\033[0m"
}

# Print a green, timestamped "[INFO ]" line on stdout when LOG_LEVEL >= 1.
#   $1 - message text (backslash escapes in it are interpreted)
# Always returns 0 when nothing is printed.
INFO() {
    [ "$LOG_LEVEL" -ge 1 ] || return 0
    local now
    now=$(date +"%Y-%m-%d %H:%M:%S")
    # %b expands backslash escapes the same way `echo -e` does.
    printf '%b\n' "\033[0;32m[INFO ] ${now}: $1\033[0m"
}

# Print a green, timestamped "[TRACE]" line on stdout, regardless of LOG_LEVEL.
#   $1 - message text (backslash escapes in it are interpreted)
TRACE() {
    local now
    now=$(date +"%Y-%m-%d %H:%M:%S")
    # %b expands backslash escapes the same way `echo -e` does.
    printf '%b\n' "\033[0;32m[TRACE] ${now}: $1\033[0m"
}

# Translate one of the ERROR_* codes into a human-readable message.
#   $1 - numeric error code
# Outputs: the message on stdout; "Unknown error" for any other value.
function error_msg() {
    # Patterns are quoted so the codes are compared literally, not as globs.
    case "$1" in
    "$ERROR_SUCCESS")
        echo "Success"
        ;;
    "$ERROR_CFG_PARAM_INVALID_EMPTY")
        echo "Invalid config params empty"
        ;;
    "$ERROR_CFG_PARAM_INVALID_IPADDR")
        echo "Invalid config params ip address"
        ;;
    "$ERROR_CFG_NOT_FOUND")
        echo "Config file not found"
        ;;
    "$ERROR_CURRENT_DEVICE_IS_BUSY")
        echo "Current device is busy"
        ;;
    "$ERROR_REMOTE_SSH_COMMAND_FAILED")
        echo "Remote ssh command failed"
        ;;
    "$ERROR_REMOTE_SSH_RSYN_FAILED")
        echo "Remote rsync command failed"
        ;;
    "$ERROR_CFG_PARAM_INVALID_DEVICE_ID")
        echo "Invalid config params device id"
        ;;
    "$ERROR_CFG_PARAM_INVALID_DEVICE_NAME")
        # Was a copy of the "device id" text, which hid the real cause.
        echo "Invalid config params device name"
        ;;
    *)
        echo "Unknown error"
        ;;
    esac
}

# Global variables
# Directory that contains this script (symlinks resolved by realpath).
CURRENT_PATH=$(dirname "$(realpath "$0")")
# Parent of the first directory named "src" found under CURRENT_PATH;
# empty when no such directory exists.
LOCAL_PATH=$(find "$CURRENT_PATH" -type d -name "src" -exec dirname {} \; | head -n 1)
# Name of the server list file, looked up under CURRENT_PATH.
CFG_FILE="remote_cfg_template.txt"
# Passed to rsync as --exclude-from, looked up under CURRENT_PATH.
EXCLUDE_FILE=".rsync_exclude_file"
# The values below are defaults only: ARCH, IP, PORT, PASSWORD, USER,
# DEVICE_ID and DOMAIN are overwritten by parse_config_file for each config
# section, and SUDO, CHIPBENCH_DOCKER_NAME, LOG_NAME and REMOTE_PATH are
# reassigned inside the main loop.
ARCH="gpu"
SUDO="sudo"
IP="10.9.113.22"
PORT="22"
PASSWORD="123456"
USER="root"
CHIPBENCH_DOCKER_NAME="chipbenchmark.gpu"
LOG_NAME="REPORT"
DEVICE_ID="0"
REMOTE_PATH="/root"
DOMAIN="remote"

DEBUG "DEBUG: $LOG_LEVEL"

# Strip leading and trailing whitespace from every line of a string.
#   $1 - text to trim
# Outputs: the trimmed text on stdout. A whitespace-only input now yields an
# empty line; the previous single regex required at least one non-space
# character and therefore returned such input unchanged.
function string_trim()
{
    # printf instead of echo so values such as "-n" are not taken as options.
    printf '%s\n' "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# Locate the line range of one "[section]" inside a config file.
#   $1 - config file to scan
#   $2 - section name (the text between the brackets)
# Outputs: "<header line> <next header line>" on stdout; for the last section
#          the second number is the file's line count. Empty line when the
#          section does not exist.
function get_region() {
    local cfg_file=$1
    local user_id=$2
    local lines last_line
    local -a numbers

    # Number every header-looking line, then keep the requested header plus
    # the one that follows it. -F makes the section name a literal string, so
    # regex metacharacters in it (e.g. '.') no longer match other headers.
    lines=$(grep -n '\[.*\]' -- "$cfg_file" \
        | grep -F -A 1 -- "[${user_id}]" \
        | cut -d: -f1 \
        | xargs)

    read -r -a numbers <<< "$lines"
    if [ "${#numbers[@]}" -eq 1 ]; then
        # Last section in the file: it extends to the final line.
        last_line=$(wc -l < "$cfg_file")
        echo "$lines" "$last_line"
    else
        echo "$lines"
    fi
}

# Read one value from an INI-style config file.
#   $1 - config file
#   $2 - section name (text between the brackets)
#   $3 - key name
# Outputs: the trimmed value of the first "<key> = <value>" line inside the
#          section on stdout; nothing when the section or key is missing.
#
# Lines that are empty or start with '#' are ignored. The lookup is done in a
# single awk pass, which fixes three problems of the earlier temp-file/sed
# pipeline: no predictable temp file is written to the current directory, the
# key must match exactly (so "id" no longer picks up "device_id"), and the
# value keeps embedded spaces and '=' characters.
function get_config() {
    local cfg_file_original=$1
    local user_id=$2
    local cfg_name=$3

    # Values are handed over through the environment rather than -v so that
    # awk does not interpret backslash sequences in them.
    CFG_SECTION="[${user_id}]" CFG_KEY="${cfg_name}" awk '
        function trim(s) {
            sub(/^[ \t\r]+/, "", s)
            sub(/[ \t\r]+$/, "", s)
            return s
        }
        /^#/ || /^$/ { next }
        {
            line = trim($0)
            if (line ~ /^\[.*\]$/) {
                # A new header ends the section we were reading.
                if (in_section) exit
                in_section = ((line "") == (ENVIRON["CFG_SECTION"] ""))
                next
            }
            if (!in_section) next
            eq = index(line, "=")
            if (eq == 0) next
            # Appending "" forces string (not numeric) comparison.
            if ((trim(substr(line, 1, eq - 1)) "") == (ENVIRON["CFG_KEY"] "")) {
                print trim(substr(line, eq + 1))
                exit
            }
        }
    ' < "$cfg_file_original"
}

# List every section name found in a config file.
#   $1 - config file
# Outputs: the names between "[" and "]" on stdout, space separated, in file
#          order. Empty lines and lines starting with '#' are skipped.
function get_cfg_id_list() {
    local cfg_file=$1
    local ids
    # The PCRE look-arounds only match text enclosed by "[" and "]", so no
    # separate bracket pre-filter is needed before extracting the names.
    ids=$(grep -vE '^#|^$' "${cfg_file}" | grep -oP '(?<=\[).+?(?=\])' | xargs)
    echo $ids
}

# Load the connection settings of one config section into the globals
# DOMAIN, PORT, ARCH, IP, PASSWORD, USER and DEVICE_ID.
#   $1 - section name
# The file read is "$CURRENT_PATH/$CFG_FILE", the same file the rest of the
# script validates and enumerates. It used to be a hard-coded name relative
# to the current working directory, which broke when the script was started
# from another directory.
function parse_config_file(){
    local cfg_path="${CURRENT_PATH:-.}/${CFG_FILE:-remote_cfg_template.txt}"
    DOMAIN=$(get_config "$cfg_path" "$1" domain)
    PORT=$(get_config "$cfg_path" "$1" port)
    ARCH=$(get_config "$cfg_path" "$1" arch)
    IP=$(get_config "$cfg_path" "$1" ip)
    PASSWORD=$(get_config "$cfg_path" "$1" password)
    USER=$(get_config "$cfg_path" "$1" user)
    DEVICE_ID=$(get_config "$cfg_path" "$1" device_id)
}


# Validate a dotted-quad IPv4 address.
#   $1 - candidate address
# Outputs: $ERROR_SUCCESS on stdout when the value is four groups of one to
#          three digits, each <= 255; $ERROR_CFG_PARAM_INVALID_IPADDR otherwise.
function check_ipaddr_is_correct()
{
    local verdict=$ERROR_CFG_PARAM_INVALID_IPADDR
    local quad='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
    local candidate octet

    # Unquoted on purpose: collapses surrounding whitespace exactly like the
    # word-split `echo $1` this check has always been based on.
    candidate=$(echo $1)

    # Anything containing letters fails this pattern, so no separate letter
    # test is required.
    if [[ "$candidate" =~ $quad ]]; then
        verdict=$ERROR_SUCCESS
        for octet in "${BASH_REMATCH[@]:1}"; do
            # 10# keeps values such as "08" decimal instead of invalid octal.
            if (( 10#$octet > 255 )); then
                verdict=$ERROR_CFG_PARAM_INVALID_IPADDR
            fi
        done
    fi
    echo $verdict
}

# Log a command line at DEBUG level and run it in the current shell via eval.
#   $1 - command string
# Returns: the exit status of the evaluated command; when $1 is empty only a
#          warning is issued.
function do_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    DEBUG "$1"
    eval "$1"
}

# Same as do_cmd, but stdout and stderr of the command are discarded.
#   $1 - command string
# Returns: the exit status of the evaluated command; when $1 is empty only a
#          warning is issued.
function do_cmd_silent() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    DEBUG "$1"
    # "silent" means the command's own output is thrown away
    eval "$1" > /dev/null 2>&1
}

# Run a command on the remote host through sshpass + ssh, showing its output.
#   $1 - remote command string (wrapped in double quotes for the remote shell)
# Globals read: USER, PASSWORD, PORT, IP, SUDO, RETRY_TIMES.
# Globals written: CMD (the full local command line).
# Returns: the exit status of the ssh pipeline; when $1 is empty only a
#          warning is issued.
# For non-root users the password is also piped to "sudo -S" on the remote
# side. NOTE: the DEBUG line contains the password in clear text.
function do_remote_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    # "$USER" is quoted: an empty or multi-word value used to make the test
    # itself fail with "unary operator expected" / "too many arguments".
    if [ "$USER" == "root" ]; then
        CMD="SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO} \"$1\""
    else
        CMD="echo '${PASSWORD}' | SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO} -S --prompt= \"$1\""
    fi
    DEBUG "$CMD"
    eval "$CMD"
}

# Run a command on the remote host, discard its output and report a status
# code on stdout.
#   $1 - remote command string (wrapped in double quotes for the remote shell)
# Globals read: USER, PASSWORD, PORT, IP, SUDO, RETRY_TIMES.
# Globals written: CMD (the full local command line).
# Outputs: $ERROR_SUCCESS when the ssh pipeline exits 0, otherwise
#          $ERROR_REMOTE_SSH_COMMAND_FAILED (also for an empty $1). DEBUG/WARN
#          lines may precede the code, so callers take the last line.
# NOTE: the DEBUG line contains the password in clear text.
function do_remote_cmd_silent() {
    local ret=$ERROR_SUCCESS
    if [ -n "$1" ]; then
        # "$USER" is quoted: an empty or multi-word value used to make the
        # test itself fail with "unary operator expected".
        if [ "$USER" == "root" ]; then
            CMD="SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO} \"$1\""
        else
            CMD="echo '${PASSWORD}' | SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO} -S --prompt= \"$1\""
        fi
        DEBUG "$CMD"
        # "silent" means the command's own output is thrown away
        if ! eval "$CMD" > /dev/null 2>&1; then
            ret=$ERROR_REMOTE_SSH_COMMAND_FAILED
        fi
    else
        WARN "cmd str is null."
        ret=$ERROR_REMOTE_SSH_COMMAND_FAILED
    fi
    echo $ret
}

# Run a command on the remote host and print what it wrote to stdout.
#   $1 - remote command string (wrapped in double quotes for the remote shell)
# Globals read: USER, PASSWORD, PORT, IP, SUDO, RETRY_TIMES.
# Globals written: CMD (the full local command line), OUTPUT (captured text).
# Outputs: the remote command's stdout. No DEBUG line is emitted here, so the
#          captured text is not mixed with log output.
function do_remote_cmd_with_return() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    # "$USER" is quoted: an empty or multi-word value used to make the test
    # itself fail with "unary operator expected".
    if [ "$USER" == "root" ]; then
        CMD="SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO} \"$1\""
    else
        CMD="echo '${PASSWORD}' | SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES}  -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO} -S --prompt= \"$1\""
    fi
    OUTPUT=$(eval "$CMD")
    echo "$OUTPUT"
}

# sync local/remote file to remote/local server
# e.g. do_sync_cmd dir1 dir2 , means sync dir1 to dir2 #3060 adduser suiyuan root. mkdir /home/chipbench/workspace
# Copy a directory tree between the local machine and the remote host with
# rsync over sshpass/ssh, showing rsync's output.
#   $1 - source (local path or "<ip>:<path>")
#   $2 - destination (local path or "<ip>:<path>")
# Globals read: PASSWORD, PORT, USER, RETRY_TIMES, CURRENT_PATH, EXCLUDE_FILE.
# Globals written: CMD (the assembled rsync command line).
# The assembled line is executed through do_cmd.
function do_sync_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    CMD="SSHPASS='${PASSWORD}' rsync"
    CMD+=" --rsync-path=\"rsync --no-p --no-g --chmod=ugo=rwX\""
    CMD+="  --exclude-from=\"$CURRENT_PATH/${EXCLUDE_FILE}\""
    CMD+="  -a"
    CMD+=" --rsh=\"sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -l ${USER}\""
    CMD+=" ${1} ${2}"
    do_cmd "$CMD"
}

# Same transfer as do_sync_cmd, with all output of the run discarded.
#   $1 - source (local path or "<ip>:<path>")
#   $2 - destination (local path or "<ip>:<path>")
# Globals read: PASSWORD, PORT, USER, RETRY_TIMES, CURRENT_PATH, EXCLUDE_FILE.
# Globals written: CMD (the assembled rsync command line).
function do_sync_cmd_silent() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    CMD="SSHPASS='${PASSWORD}' rsync"
    CMD+=" --rsync-path=\"rsync --no-p --no-g --chmod=ugo=rwX\""
    CMD+="  --exclude-from=\"$CURRENT_PATH/${EXCLUDE_FILE}\""
    CMD+="  -a"
    CMD+=" --rsh=\"sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -l ${USER}\""
    CMD+=" ${1} ${2}"
    # "silent" means nothing from the run reaches the terminal
    do_cmd "$CMD" > /dev/null 2>&1
}

# Make sure a required tool is installed; terminate the script otherwise.
#   $1 - command name
# On success the tool's version line (or, failing that, its path) is logged
# at DEBUG level. The version used to be taken from a hard-coded
# "sshpass -V" no matter which tool had been checked.
function command_is_exist() {
    local tool=$1
    local msg

    if ! command -v "${tool}" > /dev/null 2>&1; then
        WARN "${tool} could not be found. Please install ${tool}."
        WARN "For Ubuntu, you can install it using: sudo apt install ${tool}"
        exit 1
    fi

    # sshpass answers to -V; tools that reject it are asked with --version.
    msg=$("${tool}" -V 2> /dev/null < /dev/null | head -n 1)
    if [ -z "$msg" ]; then
        msg=$("${tool}" --version 2> /dev/null < /dev/null | head -n 1)
    fi
    DEBUG "${msg:-${tool} found at $(command -v "${tool}")}"
}


# Check that a device index lies inside [0, device count).
#   $1 - device id
#   $2 - number of devices present
# Outputs: ${VALID} or ${INVALID} on stdout.
# Both arguments must be plain non-negative integers: a negative id used to
# be accepted, and empty or non-numeric values triggered test errors.
function check_device_id_is_valid() {
    local uint='^[0-9]+$'
    # 10# keeps zero-padded values decimal instead of octal.
    if [[ "$1" =~ $uint && "$2" =~ $uint ]] && (( 10#$1 < 10#$2 )); then
        echo ${VALID}
    else
        echo ${INVALID}
    fi
}

# Map the current architecture to its toolchain suffix.
# Globals read: ARCH, CUDA, TOPS.
# Outputs: ${CUDA} when ARCH is exactly "gpu", ${TOPS} for anything else.
function get_suffix_from_type() {
    case "$ARCH" in
    gpu)
        echo ${CUDA}
        ;;
    *)
        echo ${TOPS}
        ;;
    esac
}

# Remove a regular file when it exists; do nothing otherwise.
#   $1 - path of the file
# The path is quoted and preceded by "--": the unquoted form split names
# containing spaces and removed the resulting words instead of the file.
function delete_file_if_exit(){
    if [ -f "$1" ]; then
        rm -rf -- "$1"
        DEBUG "rm -rf $1"
    fi
}

# Terminate the script with status 1 unless the given regular file exists.
#   $1 - path of the file
function check_file_is_exist_or_exit() {
    [ -f "$1" ] && return
    WARN "File $1 not found!"
    exit 1
}

# Terminate the script with status 1 unless the given directory exists.
#   $1 - path of the directory
# A DEBUG line confirms the directory when it is present.
function check_dir_is_exist_or_exit() {
    if [ -d "$1" ]; then
        DEBUG "Directory $1 found!"
        return
    fi
    WARN "Directory $1 not found!"
    exit 1
}

# Terminate the script with status 1 when a directory has no entries
# (or cannot be listed).
#   $1 - path of the directory
# The path is quoted: unquoted, a directory whose path contains spaces was
# listed as several non-existent paths and reported as empty.
function check_dir_has_files_or_exit() {
    if [ -z "$(ls -A -- "$1")" ]; then
        WARN "Directory[$1] is empty. Exiting..."
        exit 1
    else
        DEBUG "Directory[$1] is NOT empty. continue..."
    fi
}


# Report whether a container with the given name is running on the remote
# host.
#   $1 - container name
# Outputs: ${ACTIVE} when at least one running container has exactly that
#          name, ${INACTIVE} otherwise. DEBUG lines may precede the result,
#          so callers should take the last line.
# Exits with status 1 when no name is given.
#
# Fixes over the earlier version:
#   * the command is passed as ONE quoted argument (unquoted, only its first
#     word reached do_remote_cmd_with_return);
#   * the result test was inverted (empty output meant "active");
#   * the remote command no longer contains double quotes or '$', which were
#     mangled by the double-quote wrapping and local eval done by
#     do_remote_cmd_with_return;
#   * ${SUDO} is not repeated here because do_remote_cmd_with_return already
#     prefixes it.
function docker_is_active() {
    # Check if 1 is set
    if [ -z "$1" ]; then
        WARN "CHIPBENCH_DOCKER_NAME is not set."
        exit 1
    fi

    local remote_cmd count

    # "--filter name=" is a substring match, so the exact name is enforced
    # with grep -x; grep -c prints the number of matching containers.
    remote_cmd="docker ps --filter name=$1 --filter status=running --format '{{.Names}}' | grep -c -x -- '$1'"
    DEBUG "CMD: $remote_cmd"
    count=$(do_remote_cmd_with_return "$remote_cmd" | tail -n 1)
    DEBUG "NUM_ACTIVE: $count"

    # Anything that is not a positive number (ssh failure, empty output, ...)
    # is treated as "not running".
    if [[ "$count" =~ ^[0-9]+$ ]] && [ "$count" -gt 0 ]; then
        echo ${ACTIVE}
    else
        echo ${INACTIVE}
    fi
}

# Print the framed failure banner for one config case through WARN.
#   $1 - error message
#   $2 - additional detail, shown in brackets after the message
#   $3 - name of the config case
dump_fail_result_msg(){
    local err_text=$1
    local detail=$2
    local case_name=$3
    local row
    for row in \
        "***********************************************************************************" \
        "* CFG CASE   : ${case_name}" \
        "* FAILED" \
        "* ${err_text}[${detail}]" \
        "**********************************************************************************"; do
        WARN "$row"
    done
}

# Print the framed success banner for one config case through TRACE.
#   $1 - name of the config case
# Globals read: LOG_NAME.
dump_success_result_msg(){
    local case_name=$1
    local row
    for row in \
        "*********************************************************************************" \
        "* CFG CASE   : ${case_name}" \
        "* SUCCESS" \
        "* log saved to $LOG_NAME" \
        "*********************************************************************************"; do
        TRACE "$row"
    done
}

# Log the effective settings of the current config case at INFO level.
#   $1 - name of the config case
# Globals read: ARCH, DOMAIN, IP, PORT, PASSWORD, USER, DEVICE_ID, LOG_NAME,
#               LOCAL_PATH, CURRENT_PATH, REMOTE_PATH.
# NOTE: the password is logged in clear text.
dump_config_msg(){
    local -a rows=(
        "================================="
        "case name   : $1"
        "arch        : $ARCH"
        "domain      : $DOMAIN"
        "remote IP   : $IP"
        "remote PORT : $PORT"
        "remote PW   : $PASSWORD"
        "remote USER : $USER"
        "device id   : $DEVICE_ID"
        "log name    : $LOG_NAME"
        "local path  : $LOCAL_PATH"
        "current path: $CURRENT_PATH"
        "remote_path : $REMOTE_PATH"
    )
    local row
    for row in "${rows[@]}"; do
        INFO "$row"
    done
}

# Verify that every mandatory connection setting has a value.
# Globals read: DOMAIN, PORT, ARCH, IP, USER, DEVICE_ID, REMOTE_PATH
#               (PASSWORD is not part of the check).
# Outputs: $ERROR_SUCCESS when all are non-empty, otherwise
#          $ERROR_CFG_PARAM_INVALID_EMPTY.
function check_cfg_param_is_empty() {
    local status=$ERROR_SUCCESS
    local value
    for value in "$DOMAIN" "$PORT" "$ARCH" "$IP" "$USER" "$DEVICE_ID" "$REMOTE_PATH"; do
        if [ -z "$value" ]; then
            status=$ERROR_CFG_PARAM_INVALID_EMPTY
            break
        fi
    done
    echo $status
}

# NOTE(review): SSH_ERR is not read anywhere in the visible script.
SSH_ERR="ssh connect to host $IP port $PORT: Connection refused"

# 1.Check if the build directory has files, if not, exit
# Paths are quoted throughout this section so that a checkout located under a
# directory with spaces in its name is handled correctly.
BUILD_DIR="${LOCAL_PATH}/src/build"
check_dir_is_exist_or_exit "$BUILD_DIR"
check_dir_has_files_or_exit "$BUILD_DIR"
# 2. Parse IP, PASSWORD, USER from cfg file
check_file_is_exist_or_exit "$CURRENT_PATH/$CFG_FILE"

# Create tmp log dir
# The directory is recreated from scratch on every run. The inner quotes are
# escaped so that they are part of the evaluated command; previously they
# closed the outer string and left the path unquoted.
LOD_TMP_DIR="build_case_log"
LOG_DIR="${LOCAL_PATH}/src/${LOD_TMP_DIR}"
do_cmd_silent "test -d \"${LOG_DIR}\" && rm -rf \"${LOG_DIR}\""
do_cmd_silent "mkdir -p \"${LOG_DIR}\""
DEBUG "LOG_DIR: $LOG_DIR"

# Section names of the config file, one test case per section.
UINT_LIST=$(get_cfg_id_list "$CURRENT_PATH/$CFG_FILE")
UINT_NUM=$(echo "$UINT_LIST" | wc -w)
# Initialize counters
SUCCESS_COUNT=0
FAIL_COUNT=0
DEBUG "UINT_NUM: $UINT_NUM:[${UINT_LIST}]"
# Start time
start_time=$(date +%s)
# Process every config section: validate it, prepare the remote host, run the
# benchmark inside the container and fetch the report. Any failed step
# prints a failure banner, increments FAIL_COUNT and moves on to the next
# section.
#
# Fixes applied in this loop:
#   * SUDO is recomputed for every section (it used to stay empty for all
#     sections following a root one);
#   * the GPU busy check reads the variable it assigns ("ROCESS" typo);
#   * ${USER} and $ret_code are quoted inside [ ] so empty values no longer
#     raise test errors (an empty ret_code is now treated as a failure);
#   * the mis-nested quotes around ${LOG_NAME} are escaped properly.
for i in $UINT_LIST; do
    unset DOMAIN ARCH PORT IP PASSWORD USER DEVICE_ID LOG_NAME 
    parse_config_file $i

    # Make sure the REMOTE_PATH path is in the user directory, especially for non-root users, or rsync will fail
    REMOTE_PATH="/tmp/${DOMAIN}/chipbench"
    LOG_NAME=${i}.log
    do_cmd_silent "test -f \"${LOG_NAME}\" && rm -f \"${LOG_NAME}\""

    # root needs no sudo prefix; every other user does.
    if [ "${USER}" == "root" ]; then
        SUDO=""
    else
        SUDO="sudo"
    fi

    # must after remote_path
    dump_config_msg $i

    # Check if the necessary parameters are empty
    ret_code=$(check_cfg_param_is_empty)
    DEBUG "check_cfg_param_is_empty ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg $ret_code)
        dump_fail_result_msg "$err_str" "some params are null" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Check if the IP address is valid
    ret_code=$(check_ipaddr_is_correct $IP)
    DEBUG "check_ipaddr_is_correct ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg $ret_code)
        dump_fail_result_msg "$err_str" "$IP" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # 3. Check whether sshpass and rsync are installed
    command_is_exist sshpass
    command_is_exist rsync
    
    # 4. Check if REMOTE_PATH exists on the remote server, create it if it does not
    # do_remote_cmd "adduser ${USER} root"
    # Check if the user is not in the root group and add them to the root group if they are not
    # if ! id -nG "$USER" | grep -qw "root"; then
    #     DEBUG "User $USER is not in the root group. Adding to root group..."
    #     do_remote_cmd_silent "adduser ${USER} root"
    # else
    #     DEBUG "User $USER is already in the root group."
    # fi

    # Check ssh connection is ok
    # (tail -n 1 drops DEBUG lines that share stdout with the status code)
    ret_code=$(do_remote_cmd_silent "pwd" |tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg $ret_code)
        dump_fail_result_msg "$err_str" "failed to connect to ${USER}:${IP}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Check if the user is in the root group and add them to the root group
    if [ "$USER" != "root" ]; then
        ret_code=$(do_remote_cmd_silent "adduser ${USER} root" | tail -n 1)
        DEBUG "do_remote_cmd_silent ret_code: $ret_code"
        if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
            err_str=$(error_msg $ret_code)
            dump_fail_result_msg "$err_str" "adduser ${USER} root" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    fi

    # delete remote path /tmp/${DOMAIN}/chipbench if it exists
    # (best effort: the result is not checked)
    ret_code=$(do_remote_cmd_silent "test -d ${REMOTE_PATH}/ && rm -rf ${REMOTE_PATH}/")

    # create remote path /tmp/${DOMAIN}/chipbench
    ret_code=$(do_remote_cmd_silent "mkdir -p ${REMOTE_PATH}/" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg $ret_code)
        dump_fail_result_msg "$err_str" "mkdir -p ${REMOTE_PATH}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # chown
    ret_code=$(do_remote_cmd_silent "chown -R $USER:$USER ${REMOTE_PATH}/" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg $ret_code)
        dump_fail_result_msg "$err_str" "chown -R $USER:$USER ${REMOTE_PATH}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # 5. Sync the current directory to the remote server
    # NOTE(review): the rsync exit status is not checked here, so a failed
    # upload only shows up in later steps.
    DEBUG "sync current directory to remote server"
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_sync_cmd "${LOCAL_PATH}/" "${IP}:${REMOTE_PATH}/"
    else
        ret_code=$(do_sync_cmd_silent "${LOCAL_PATH}/" "${IP}:${REMOTE_PATH}/")
    fi

    # 6. Build the docker container
    CHIPBENCH_DOCKER_NAME=$(do_remote_cmd_with_return "${REMOTE_PATH}/docker/build_or_run.sh name $ARCH $DOMAIN")
    DEBUG "Build docker container[$CHIPBENCH_DOCKER_NAME]..."
    if [ "$FORCE_DELETE_DOCKER" ]; then
        DEBUG "Force delete the docker container[$CHIPBENCH_DOCKER_NAME]..."
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh rm $ARCH $DOMAIN")
    fi

    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_remote_cmd "${REMOTE_PATH}/docker/build_or_run.sh build $ARCH $DOMAIN"
    else
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh build $ARCH $DOMAIN")
    fi

    # 7.if container is already running, skip the restart
    mount_cmd="${SUDO} mount -t nfs -o ro -o vers=3 10.9.231.206:/ef_Infra/devtools /home/.devtools"
    ret_code=$(do_remote_cmd_silent "${mount_cmd}")
    DOCKER_ACTIVE=$(do_remote_cmd_with_return "${REMOTE_PATH}/docker/build_or_run.sh status $ARCH $DOMAIN")
    DEBUG "docker container [$CHIPBENCH_DOCKER_NAME] status: $DOCKER_ACTIVE"
    if [ "$DOCKER_ACTIVE" == ${ACTIVE} ]; then
        DEBUG "Docker container[$CHIPBENCH_DOCKER_NAME] is already running."
    else
        DEBUG "Start the docker container[$CHIPBENCH_DOCKER_NAME]..."
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh restart $ARCH $DOMAIN")
    fi

    # get the gpu name
    if [ "$ARCH" == "gpu" ];then
        DEVICE_NAME=$(do_remote_cmd_with_return "nvidia-smi --query-gpu name --format=noheader,csv -i ${DEVICE_ID}")
    else
        # NOTE(review): this branch calls "ppp" while every other non-gpu
        # query below calls "pppp" - confirm which tool name is intended.
        inquery_cmd="ppp| grep -E '^[| ].[$DEVICE_ID-$DEVICE_ID].[ ]'| grep -v 'C'"
        tmp_name=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NAME=$(echo "$tmp_name" | awk '{print $3}')
    fi

    # Check if the docker container env is correct
    if [ "$ARCH" == "gcu" ];then
        ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"topscc --version\\\"" | tail -n 1)
        if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
            err_str=$(error_msg $ret_code)
            DEBUG "err_str: $err_str, try create env"
            WARN "${DEVICE_NAME}, ENV is not correct, try to create env, WAITTING..."
            random_str=$(date +%s%N)
            deb_path=/tmp/${random_str}
            do_remote_cmd "test ! -d ${deb_path} && ${SUDO} mkdir -p ${deb_path}"
            efgrab_cmd="source /home/.devtools/tools/env.sh && cd ${deb_path} && efgrab efml && efgrab topsplatform && chmod +x ./*.run && dpkg -i ./*.deb && ./*.run --no-auto-load -y && rm -rf ${deb_path}"
            if [ ${LOG_LEVEL} -ge 2 ]; then
                do_remote_cmd "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"${efgrab_cmd}\\\""
            else
                ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"${efgrab_cmd}\\\"")
            fi
            efml_so_path=$(do_remote_cmd_with_return "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"dpkg -L efml |grep  -F  libefml.so\\\"" | tail -n 1)
            efml_so_dir_path=$(dirname "$efml_so_path")
            DEBUG "efml_so_dir_path: $efml_so_dir_path"
            ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"cp -f ${efml_so_dir_path}/libefml.so* /usr/lib/\\\"")
        fi
    fi

    # 8. Run the test case pppp -L  | awk '/[0-9]/ {print $1}'|grep -v "-"|wc -l
    # 8.1 Check if the device id is valid
    DEBUG "Check if the device id is valid"
    if [ "$ARCH" == "gpu" ];then
        inquery_cmd=" nvidia-smi -L"
        DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NUM=$(echo "$DEVICE_INFO" | grep -v '^\s*$' | wc -l)
    else
        inquery_cmd="pppp -L"
        DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NUM=$(echo "$DEVICE_INFO" | awk '/[0-9]/ {print $1}'| grep -v "-"| wc -l)
    fi

    DEBUG "$DEVICE_INFO"
    DEBUG "GPU ID is :${DEVICE_ID} , GPU count is $DEVICE_NUM"
    valid_result=$(check_device_id_is_valid $DEVICE_ID $DEVICE_NUM)
    DEBUG "valid_result: $valid_result"
    if [ "$valid_result" == ${INVALID} ]; then
        err_str=$(error_msg $ERROR_CFG_PARAM_INVALID_DEVICE_ID)
        dump_fail_result_msg "$err_str" "Invalid device id $DEVICE_ID, device id should be [0,$(($DEVICE_NUM-1))]" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    DEBUG "Device id $DEVICE_ID is valid."
    # 8.2 Peek the remote directory, for de
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_remote_cmd "ls -la ${REMOTE_PATH}"
    fi

    # 8.3 Get the GPU Information
    if [ "$ARCH" == "gpu" ];then
        inquery_cmd="nvidia-smi"
    else
        inquery_cmd="pppp"
    fi
    DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
    DEBUG "$DEVICE_INFO"

    # 8.4 Check the GPU processes. If there are processes running on the GPU, exit
    DEBUG "Current device id is $DEVICE_ID, name is:$DEVICE_NAME"
    # 8.4.1 Check if the GPU Name is consistent with the cfg file
    # The section name (lower-cased, '_' -> ' ') must occur inside the
    # reported device name (lower-cased, '-' -> ' ').
    lowercase_device_name=$(echo "$DEVICE_NAME" | tr '[:upper:]' '[:lower:]' | tr '-' ' ')
    cfg_device_name=$(echo "$i" | tr '[:upper:]' '[:lower:]' | tr '_' ' ')
    DEBUG "lowercase_device_name: $lowercase_device_name"
    DEBUG "cfg_device_name: $cfg_device_name"
    do_cmd_silent "echo '$lowercase_device_name' | grep -q '$cfg_device_name'"
    if [ $? -ne 0 ]; then
        err_str=$(error_msg $ERROR_CFG_PARAM_INVALID_DEVICE_NAME)
        dump_fail_result_msg "$err_str" "Device name is $DEVICE_NAME" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # 8.4.2 Check if the GPU is busy
    DEBUG "get the GPU process"
    if [ "$ARCH" == "gpu" ];then
        # Was assigned to "ROCESS", leaving PROCESS empty and the busy check
        # permanently disabled.
        PROCESS=$(do_remote_cmd_with_return "nvidia-smi --query-compute-apps pid --format=noheader,csv -i ${DEVICE_ID}")
        PROCESS_NUM=$(echo "$PROCESS" | grep -v '^\s*$' | wc -l)
        if [ $PROCESS_NUM -gt 0 ]; then
            err_str=$(error_msg $ERROR_CURRENT_DEVICE_IS_BUSY)
            dump_fail_result_msg "$err_str" "There are $PROCESS_NUM processes running on GPU $DEVICE_NAME:${DEVICE_ID}" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    else
        gcu_info=$(do_remote_cmd_with_return "pppp --pmon -c 1 -o -i ${DEVICE_ID}")
        gcu_no_process=$(echo "$gcu_info" | grep  'no process running on' | wc -l)
        DEBUG "gcu_no_process: $gcu_no_process"
        if [ $gcu_no_process -eq 0 ]; then
            err_str=$(error_msg $ERROR_CURRENT_DEVICE_IS_BUSY)
            dump_fail_result_msg "$err_str" "There are processes running on GPU $DEVICE_NAME:${DEVICE_ID}" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    fi

    # 8.5 Run the test case
    DEBUG "No process is running on GPU $DEVICE_ID"
    if [ ${LOG_LEVEL} -ge 2 ]; then
        do_remote_cmd "docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${DEVICE_ID} ${REMOTE_PATH}/src/report.sh ${LOG_NAME} ${ARCH}\\\""
    else
        ret_code=$(do_remote_cmd_silent "docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${DEVICE_ID} ${REMOTE_PATH}/src/report.sh ${LOG_NAME} ${ARCH}\\\"")
    fi

    # Sync the remote directory back to the local directory
    DEBUG "sync report file to local"
    do_sync_cmd_silent "${IP}:${REMOTE_PATH}/" "${LOCAL_PATH}/"

    do_cmd_silent "mv -f ${i}.log ${LOG_DIR}/"

    # 9. Save the log file
    INFO "Log saved in $LOG_NAME"

    # Stop the docker container
    # WARN "Stop the docker container, [$CHIPBENCH_DOCKER_NAME]..."
    # do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh stop $ARCH $DOMAIN"

    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    dump_success_result_msg "${i}"
done

# Report the wall-clock duration (measured from start_time) and the
# per-case tally, then finish with status 0 regardless of failures.
end_time=$(date +%s)
elapsed_time=$((end_time - start_time))
printf '%s\n' \
    "Elapsed time: $elapsed_time seconds" \
    "All count: ${UINT_NUM}, Success count: $SUCCESS_COUNT, Fail count: $FAIL_COUNT" \
    "All done."

exit 0


本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/2244337.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

支持用户注册和登录、发布动态、点赞、评论、私信等功能的社交媒体平台创建!!!

需要整体源代码的可以在我的代码仓下载https://gitcode.com/speaking_me/social-media-platformTest.git 社交媒体平台 描述:社交媒体平台需要支持用户注册、发布动态、点赞、评论、私信等功能。 技术栈: 前端:React, Angular, Vue.js后端…

【MySQL实战45讲笔记】基础篇——redo log 和 binlog

系列文章 基础篇——MySQL 的基础架构 目录 系列文章1. 重要的日志模块:redo log 和 binlog1.1 redo log1.2 binlog1.3 执行器和 InnoDB 引擎内部如何执行更新语句 1. 重要的日志模块:redo log 和 binlog 前面系统的了解了一个查询语句的执行流程…

【Redis】Redis实现的消息队列

一、用list实现【这是数据类型所以支持持久化】 消息基于redis存储不会因为受jvm内存上限的限制,支持消息的有序性,基于redis的持久化机制,只支持单一消费者订阅,无法避免消息丢失。 二、用PubSub【这不是数据类型,是…

(计算机毕设)基于SpringBoot+Vue的房屋租赁系统的设计与实现

博主可接毕设设计&#xff01;&#xff01;&#xff01; 各种毕业设计源码只要是你有的题目我这里都有源码 摘 要 社会的发展和科学技术的进步&#xff0c;互联网技术越来越受欢迎。网络计算机的生活方式逐渐受到广大人民群众的喜爱&#xff0c;也逐渐进入了每个用户的使用。互…

云原生之运维监控实践-使用Prometheus与Grafana实现对Nginx和Nacos服务的监测

背景 如果你要为应用程序构建规范或用户故事&#xff0c;那么务必先把应用程序每个组件的监控指标考虑进来&#xff0c;千万不要等到项目结束或部署之前再做这件事情。——《Prometheus监控实战》 去年写了一篇在Docker环境下部署若依微服务ruoyi-cloud项目的文章&#xff0c;当…

游戏引擎学习第19天

介绍 这段内容描述了开发者在进行游戏开发时&#xff0c;对于音频同步和平台层的理解和调整的过程。以下是更详细的复述&#xff1a; 开发者表达了他希望今天继续进行的工作内容。他提到&#xff0c;昨天他讲解了一些关于音频的内容&#xff0c;今天他想稍微深入讲解一下他正…

node版本升级,从卸载到使用nvm管理node版本并配置vue环境(学习趟雷版)

查找node版本和安装路径 查找当前node版本 node -v 查看弄得版本安装路径 where node 卸载node&#xff08;没安装过node的可以直接跳过&#xff09; 通过控制面板删除node&#xff0c;按下【winR】键&#xff0c;输入control 控制面板找到默认程序 找到node程序点击卸载 …

每天五分钟机器学习:支持向量机算法数学基础之核函数

本文重点 从现在开始,我们将开启支持向量机算法的学习,不过在学习支持向量机算法之前,我们先来学习一些支持向量机所依赖的数学知识,这会帮助我们更加深刻的理解支持向量机算法,本文我们先来学习核函数。 定义 核函数(Kernel Function)是一种在支持向量机(SVM)、高…

机器学习基础04

目录 1.朴素贝叶斯-分类 1.1贝叶斯分类理论 1.2条件概率 1.3全概率公式 1.4贝叶斯推断 1.5朴素贝叶斯推断 1.6拉普拉斯平滑系数 1.7API 2.决策树-分类 2.1决策树 2.2基于信息增益的决策树建立 2.2.1信息熵 2.2.2信息增益 2.2.3信息增益决策树建立步骤 2.3基于基…

STM32芯片EXIT外部中断的配置与原理以及模板代码(标准库)

配置EXIT外部中断其实就是把GPIO刀NVIC的各个外设配置好 第一步&#xff1a;配置RCC&#xff0c;把我们涉及到的外设的时钟都打开 &#xff08;此处EXTI是默认打开的&#xff0c;而NVIC是内核外设无需配置&#xff09; 第二步&#xff1a;配置GPIO,选择端口为输入模式 第三…

pytest结合allure做接口自动化

这是一个采用pytest框架&#xff0c;结合allure完成接口自动化测试的项目&#xff0c;最后采用allure生成直观美观的测试报告&#xff0c;由于添加了allure的特性&#xff0c;使得测试报告覆盖的内容更全面和阅读起来更方便。 1. 使用pytest构建测试框架&#xff0c;首先配置好…

生成自签名证书并配置 HTTPS 使用自签名证书

生成自签名证书 1. 运行 OpenSSL 命令生成证书和私钥 在终端中输入以下命令&#xff0c;生成自签名证书和私钥文件&#xff1a; sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout self_signed.key -out self_signed.pem-x509&#xff1a;生成自签名证书。…

Linux网络:守护进程

Linux网络&#xff1a;守护进程 会话进程组会话终端 守护进程setsiddaemon 在创建一个网络服务后&#xff0c;往往这个服务进程是一直运行的。但是对于大部分进程来说&#xff0c;如果退出终端&#xff0c;这个终端上创建的所有进程都会退出&#xff0c;这就导致进程的生命周期…

5.4.2-1 编写Java程序在HDFS上创建文件

本次实战涉及使用Java操作Hadoop HDFS&#xff0c;包括创建文件、判断文件存在性及异常处理。通过手动添加依赖、启动HDFS服务&#xff0c;成功在HDFS上创建和检查文件。进一步探索了文件操作的最佳实践&#xff0c;如检查文件存在性以避免重复创建&#xff0c;以及处理HDFS安全…

十六.SpringCloudAlibaba极简入门-整合Grpc代替OpenFeign

前言 他来了他来了&#xff0c;停了快2个月了终于又开始更新文章啦&#xff0c;这次带来的绝对是干货&#xff01;&#xff01;&#xff01;。由于公司项目进行重构的时候考虑到&#xff0c;OpenFeign做为服务通信组件在高并发情况下有一定的性能瓶颈&#xff0c;所以将其替换…

【pytest】pytest注解使用指南

前言&#xff1a;在 pytest 测试框架中&#xff0c;注解&#xff08;通常称为装饰器&#xff09;用于为测试函数、类或方法提供额外的信息或元数据。这些装饰器可以影响测试的执行方式、报告方式以及测试的组织结构。pytest 提供了多种内置的装饰器&#xff0c;以及通过插件扩展…

百度AI人脸检测与对比

1.注册账号 打开网站 https://ai.baidu.com/ &#xff0c;注册百度账号并登录 2.创建应用 3.技术文档 https://ai.baidu.com/ai-doc/FACE/yk37c1u4t 4.Spring Boot简单集成测试 pom.xml 配置&#xff1a; <!--百度AI--> <dependency> <groupId>com.baidu.…

A040-基于springboot的智能停车计费系统设计与实现

&#x1f64a;作者简介&#xff1a;在校研究生&#xff0c;拥有计算机专业的研究生开发团队&#xff0c;分享技术代码帮助学生学习&#xff0c;独立完成自己的网站项目。 代码可以查看文章末尾⬇️联系方式获取&#xff0c;记得注明来意哦~&#x1f339; 赠送计算机毕业设计600…

力扣 LeetCode 222. 完全二叉树的节点个数(Day7:二叉树)

解题思路&#xff1a; 解法一&#xff1a;普通二叉树解法 使用后序遍历 有一行的精简版代码但不利于理解采用的哪一种遍历方式 解法二&#xff1a;利用上完全二叉树的特点 一个指针left&#xff0c;一个指针right left一直向左遍历&#xff0c;right一直向右遍历&#xff…

hhdb数据库介绍(9-21)

计算节点参数说明 checkClusterBeforeDnSwitch 参数说明&#xff1a; PropertyValue参数值checkClusterBeforeDnSwitch是否可见否参数说明集群模式下触发数据节点高可用切换时&#xff0c;是否先判断集群所有成员正常再进行数据节点切换默认值falseReload是否生效是 参数设…