Requirements:
1. A batch of tables delivered as CSV files.
2. The fields use a specific delimiter (semicolon in this case).
3. The headers contain special characters; the files may be UTF-16 or UTF-8 encoded, so everything is unified to UTF-8.
4. Read the first line as the header and clean it up: empty fields, special characters, and the BOM (Byte Order Mark).
5. Convert the Linux-style path into a path MySQL can recognize, and first rename Chinese file names to English, otherwise MySQL cannot LOAD the file (a sketch of the rename helper follows this list).
6. Loop over the files and import them into the database.
You can import the tables one by one in a database tool such as Navicat, or batch the whole job with a shell script that walks every CSV file under a folder, including its subdirectories and their subdirectories.
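The main script below starts by calling a helper, rename_ch2en.sh, which is not shown in the original. A minimal sketch of what such a helper could look like, assuming it is acceptable to replace non-ASCII (e.g. Chinese) characters in file and directory names with underscores rather than transliterate them to pinyin:

#!/bin/bash
# rename_ch2en.sh -- hypothetical sketch, not the original helper.
# Replace non-ASCII characters in file and directory names under ROOT_DIR
# with underscores so that the resulting paths are safe for MySQL LOAD DATA.
ROOT_DIR="/d/urfolder/datafile"

# -depth processes children before their parent directories,
# so renaming a directory never invalidates paths still to be visited
find "$ROOT_DIR" -depth -name '*[! -~]*' | while IFS= read -r path; do
    dir=$(dirname "$path")
    base=$(basename "$path")
    # replace every byte outside printable ASCII with '_', then collapse repeats
    new_base=$(echo "$base" | LC_ALL=C sed 's/[^ -~]/_/g; s/__*/_/g')
    if [[ "$new_base" != "$base" ]]; then
        mv -v "$path" "$dir/$new_base"
    fi
done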
Implementation
#!/bin/bash
# Step 0: rename file and folder names from Chinese to English (enable this whenever new files arrive; see the sketch above)
sh rename_ch2en.sh
MYSQL_DB='impcsv'
MYSQL_USER='username'
MYSQL_PASSWORD='pwd'
MYSQL_HOST="localhost"
TABLE='your_table_name' # placeholder only; actual table names are generated per file below
ROOT_DIR="/d/urfolder/datafile"
utf8_suffix='_utf8.csv' # suffix that marks files already converted to UTF-8
#for FILE in "${CSV_FILES[@]}"; do
# echo "Importing $FILE into $DATABASE:$TABLE"
# mysql -u "$USER" -p"$PASS" "$DATABASE" -e "LOAD DATA LOCAL INFILE '$FILE' INTO TABLE $TABLE FIELDS TERMINATED BY ';' ENCLOSED BY '\"' LINES TERMINATED BY '\n' IGNORE 1 LINES;"
#done
# Walk the directory tree for CSV files. First remove any previously converted *utf8* files; otherwise they would be converted again on the next run and end up garbled.
find "$ROOT_DIR" -type f -name "*${utf8_suffix}" -exec rm -f {} +
find "$ROOT_DIR" -type f -name "*.csv" | while read -r csv_file; do
# Directory of the file relative to ROOT_DIR: strip the ROOT_DIR prefix and normalize whitespace,
# e.g. for /d/urfolder/datafile/test/firstfolder/test1.csv
relative_dir=$(dirname "$csv_file" | sed -E "s|$ROOT_DIR/||; s/\s+/_/g") # test/firstfolder
# Last directory component and the CSV file name
folder_name=$(basename "$relative_dir") # last directory level, e.g. firstfolder
echo "folder_name = $folder_name"
file_name_org=$(basename "$csv_file" .csv)
# Collapse runs of whitespace in the file name into underscores
file_name=$(echo "$file_name_org" | sed -E "s/\s+/_/g")
echo "file_name = $file_name" #file_name = test1
# Build the table-name prefix: '/' -> '_', drop '#', collapse whitespace
rel_dir_sed=$(echo "$relative_dir" | sed -E "s/\//_/g; s/#//g; s/\s+/_/g")
table_name="${rel_dir_sed}_${file_name}"
echo "table_name = $table_name" #table_name = firstfolder_test1
conv_file="$file_name"$utf8_suffix
echo "utf8 file = $conv_file"
# Change into the directory that contains the file
target_dir="$ROOT_DIR/$relative_dir"
# MySQL on Windows cannot use the /d/... (MSYS/Cygwin) form of the path; convert it to a Windows-style path.
# cygpath -w is the general solution under Cygwin; the sed below only handles the D: drive.
#wnds_tgt_dir=$(cygpath -w "$target_dir")
wnds_tgt_dir=$(echo "$target_dir" | sed 's|/d/|D:/|')
if [[ $(pwd) != "$target_dir" ]]; then
echo "working on folder $relative_dir, but now we are in $(pwd), let's jump to folder $relative_dir"
cd "$target_dir" || { echo "无法进入目标文件夹: $target_dir"; continue; }
fi
echo "current folder: $(pwd)"
# Detect the file encoding
file_encoding=$(file -bi "$csv_file" | awk -F'=' '{print $2}')
# Convert only when necessary, based on the detected encoding
if [[ "$file_encoding" == "utf-8" ]]; then
echo "file is already UTF-8, no conversion needed"
cp "$csv_file" "./$conv_file"
elif [[ "$file_encoding" == "utf-16le" ]]; then
echo "文件编码是 UTF-16LE,正在转换为 UTF-8..."
iconv -f UTF-16LE -t UTF-8 "$csv_file" > "./$conv_file"
echo "转换完成,保存为 ./$conv_file"
else
echo "文件编码未知:$file_encoding,无法处理"
exit 1
fi
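# Note (assumption, not part of the original script): `file -bi` reports files that
# contain only ASCII bytes as "us-ascii", not "utf-8". If your exports can be pure
# ASCII, treat that charset the same way as utf-8, e.g. add:
#   elif [[ "$file_encoding" == "us-ascii" ]]; then cp "$csv_file" "./$conv_file"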
echo "starting convert encoding character from utf16 to utf8...."
#iconv -f UTF-16LE -t UTF-8 $csv_file > ./$conv_file
##chmod 777 ./$conv_file.csv
# Build the table definition from the CSV header:
# read the column names from the first line of the converted file
echo "......start getting the column name from $conv_file file......"
columns=$(head -n 1 "./$conv_file" | sed -E "s/ +/_/g; s/#//g; s/\"//g")
# Accumulate the column definitions here
final_columns=""
# Split the header line on the delimiter and process each field
IFS=';' read -ra fields <<< "$columns" # read into an array, splitting on ';'
for field in "${fields[@]}"; do
#field=$(echo "$field" | xargs) # alternative: trim surrounding spaces with xargs
#printf 'Field: "%s"\n' "$field" # debug: print the raw field; a hex dump shows e.g. efbb bf54 494d 450a ...TIME.
#echo "$field" | xxd # debug: the hex dump reveals a UTF-8 BOM (Byte Order Mark, \xEF\xBB\xBF) at the start of the first field, which is what makes the string comparison below fail
# Trim surrounding whitespace, strip CR and the BOM
field=$(echo "$field" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' | tr -d '\r' | sed 's/^\xEF\xBB\xBF//')
if [[ -z "$field" ]]; then
continue # skip empty fields
fi
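# TIME is stored as varchar(255); every other column is assumed to be numeric (float)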
if [[ "$field" == "TIME" ]]; then
final_columns+="$field varchar(255), "
else
final_columns+="$field float, "
fi
#echo $field
done
# Drop the trailing ", " from the column list
final_columns="${final_columns%, }"
# Build the SQL: drop and recreate the table, enable local_infile, then load the data
echo "start creating table...."
drop_table_sql="drop TABLE IF EXISTS \`$table_name\`; "
create_table_sql="CREATE TABLE \`$table_name\` ($final_columns);"
echo "$create_table_sql"
# Enable LOAD DATA LOCAL INFILE on the server side (SET GLOBAL requires sufficient privileges)
upd_var_sql="set global local_infile='ON';"
# Load the CSV data into the table
load_data_sql="LOAD DATA LOCAL INFILE '$wnds_tgt_dir/$conv_file' INTO TABLE \`$table_name\` FIELDS TERMINATED BY ';' ENCLOSED BY '\"' LINES TERMINATED BY '\n' IGNORE 1 ROWS;"
echo "load_data_sql: $load_data_sql"
exec_sql="$drop_table_sql $create_table_sql $upd_var_sql $load_data_sql"
# Execute the whole batch in a single mysql call
echo "$exec_sql" | mysql --local-infile=1 -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" -h "$MYSQL_HOST" "$MYSQL_DB"
echo "Data from $csv_file has been successfully imported into table \`$table_name\`"
done
echo "Import completed."