一 部署Spark客户端
1.1 部署spark3客户端
tar -zxvf spark-3.3.1-bin-3.0.0-cdh6.3.2.tgz -C /opt/cloudera/parcels/CDH/lib
cd /opt/cloudera/parcels/CDH/lib
mv spark-3.3.1-bin-3.0.0-cdh6.3.2/ spark3
将 CDH 集群的 spark-env.sh 复制到 /opt/cloudera/parcels/CDH/lib/spark3/conf 下:
cp /etc/spark/conf/spark-env.sh /opt/cloudera/parcels/CDH/lib/spark3/conf
chmod +x /opt/cloudera/parcels/CDH/lib/spark3/conf/spark-env.sh
#修改 spark-env.sh
vim /opt/cloudera/parcels/CDH/lib/spark3/conf/spark-env.sh
export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark3
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf}
将 gateway 节点的 hive-site.xml 复制到 spark3/conf 目录下,不需要做变动:
cp /etc/hive/conf/hive-site.xml /opt/cloudera/parcels/CDH/lib/spark3/conf/
1.2 部署Spark2客户端
tar -zxvf spark-2.4.0-bin-hadoop2.7.tgz -C /opt/cloudera/parcels/CDH/lib
cd /opt/cloudera/parcels/CDH/lib
mv spark-2.4.0-bin-hadoop2.7/ spark2
将 CDH 集群的 spark-env.sh 复制到 /opt/cloudera/parcels/CDH/lib/spark2/conf 下:
cp /etc/spark/conf/spark-env.sh /opt/cloudera/parcels/CDH/lib/spark2/conf
chmod +x /opt/cloudera/parcels/CDH/lib/spark2/conf/spark-env.sh
#修改 spark-env.sh
vim /opt/cloudera/parcels/CDH/lib/spark2/conf/spark-env.sh
export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark2
HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf}
将 gateway 节点的 hive-site.xml 复制到 spark2/conf 目录下,不需要做变动:
cp /etc/hive/conf/hive-site.xml /opt/cloudera/parcels/CDH/lib/spark2/conf/
二 创建spark-sql
2.1 spark3
vim /opt/cloudera/parcels/CDH/bin/spark3-sql
#!/bin/bash
# spark3-sql: launcher for the Spark 3 SQL command-line driver.
# Usage: spark3-sql [arguments...]
# Every argument is forwarded unchanged to spark3/bin/spark-submit.

# Point Hadoop and YARN clients at the cluster configuration directory.
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf

# Resolve the real location of this script: follow symbolic links until
# SOURCE names a regular file, tracking the directory that contains it.
SOURCE="${BASH_SOURCE[0]}"
BIN_DIR="$(dirname "$SOURCE")"
while [[ -h "$SOURCE" ]]; do
  SOURCE="$(readlink "$SOURCE")"
  # A relative link target is interpreted relative to the link's directory.
  [[ $SOURCE != /* ]] && SOURCE="$BIN_DIR/$SOURCE"
  BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"
done
BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"

# The lib directory is a sibling of the directory holding this script.
LIB_DIR="$BIN_DIR/../lib"
export HADOOP_HOME="$LIB_DIR/hadoop"

# Autodetect JAVA_HOME if not defined
# Paths are quoted so a resolved directory containing whitespace or glob
# characters is not split or expanded.
. "$LIB_DIR/bigtop-utils/bigtop-detect-javahome"

# Replace this shell with spark-submit running the SQL CLI driver class.
exec "$LIB_DIR/spark3/bin/spark-submit" --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver "$@"
配置 spark-sql 快捷方式
chmod +x /opt/cloudera/parcels/CDH/bin/spark3-sql
alternatives --install /usr/bin/spark-sql spark-sql /opt/cloudera/parcels/CDH/bin/spark3-sql 1
2.2 spark2
vim /opt/cloudera/parcels/CDH/bin/spark2-sql
#!/bin/bash
# spark2-sql: launcher for the Spark 2 SQL command-line driver.
# Usage: spark2-sql [arguments...]
# Every argument is forwarded unchanged to spark2/bin/spark-submit.

# Point Hadoop and YARN clients at the cluster configuration directory.
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf

# Resolve the real location of this script: follow symbolic links until
# SOURCE names a regular file, tracking the directory that contains it.
SOURCE="${BASH_SOURCE[0]}"
BIN_DIR="$(dirname "$SOURCE")"
while [[ -h "$SOURCE" ]]; do
  SOURCE="$(readlink "$SOURCE")"
  # A relative link target is interpreted relative to the link's directory.
  [[ $SOURCE != /* ]] && SOURCE="$BIN_DIR/$SOURCE"
  BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"
done
BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"

# The lib directory is a sibling of the directory holding this script.
LIB_DIR="$BIN_DIR/../lib"
export HADOOP_HOME="$LIB_DIR/hadoop"

# Autodetect JAVA_HOME if not defined
# Paths are quoted so a resolved directory containing whitespace or glob
# characters is not split or expanded.
. "$LIB_DIR/bigtop-utils/bigtop-detect-javahome"

# Replace this shell with spark-submit running the SQL CLI driver class.
exec "$LIB_DIR/spark2/bin/spark-submit" --class org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver "$@"
配置 spark-sql 快捷方式
chmod +x /opt/cloudera/parcels/CDH/bin/spark2-sql
alternatives --install /usr/bin/spark-sql spark-sql /opt/cloudera/parcels/CDH/bin/spark2-sql 2
三 配置conf
3.1 spark3
cd /opt/cloudera/parcels/CDH/lib/spark3/conf
## 开启日志
mv log4j2.properties.template log4j2.properties
## spark-defaults.conf 配置
cp /opt/cloudera/parcels/CDH/lib/spark/conf/spark-defaults.conf ./
# 修改 spark-defaults.conf
vim /opt/cloudera/parcels/CDH/lib/spark3/conf/spark-defaults.conf
删除 spark.extraListeners、spark.sql.queryExecutionListeners、spark.yarn.jars
添加 spark.yarn.jars=hdfs:///spark/3versionJars/*
hadoop fs -mkdir -p /spark/3versionJars
cd /opt/cloudera/parcels/CDH/lib/spark3/jars
hadoop fs -put *.jar /spark/3versionJars
3.2 spark2
cd /opt/cloudera/parcels/CDH/lib/spark2/conf
## 开启日志
mv log4j.properties.template log4j.properties
## spark-defaults.conf 配置
cp /opt/cloudera/parcels/CDH/lib/spark/conf/spark-defaults.conf ./
# 修改 spark-defaults.conf
vim /opt/cloudera/parcels/CDH/lib/spark2/conf/spark-defaults.conf
删除 spark.extraListeners、spark.sql.queryExecutionListeners、spark.yarn.jars
添加 spark.yarn.jars=hdfs:///spark/2versionJars/*
hadoop fs -mkdir -p /spark/2versionJars
cd /opt/cloudera/parcels/CDH/lib/spark2/jars
hadoop fs -put *.jar /spark/2versionJars
四 创建spark-submit
4.1 spark3
vim /opt/cloudera/parcels/CDH/bin/spark3-submit
#!/usr/bin/env bash
# spark3-submit: runs org.apache.spark.deploy.SparkSubmit through
# spark3/bin/spark-class, forwarding every argument unchanged.

# Cluster configuration directory for both Hadoop and YARN clients.
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf

# Follow symbolic links to find the directory that really holds this script.
# NOTE(review): SOURCE and BIN_DIR are not read again in this script because
# LIB_DIR below is a fixed path; confirm the sourced helper does not rely on
# them before removing this section.
SOURCE="${BASH_SOURCE[0]}"
BIN_DIR="$(dirname "$SOURCE")"
while [[ -L "$SOURCE" ]]; do
  SOURCE="$(readlink "$SOURCE")"
  # Relative link targets are taken relative to the link's own directory.
  if [[ $SOURCE != /* ]]; then
    SOURCE="$BIN_DIR/$SOURCE"
  fi
  BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"
done
BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"

# Fixed library directory of the CDH parcel.
LIB_DIR=/opt/cloudera/parcels/CDH/lib
export HADOOP_HOME="$LIB_DIR/hadoop"

# Autodetect JAVA_HOME if not defined
. "$LIB_DIR/bigtop-utils/bigtop-detect-javahome"

# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0

# Replace this shell with the Spark launcher.
exec "$LIB_DIR/spark3/bin/spark-class" org.apache.spark.deploy.SparkSubmit "$@"
配置 spark3-submit 快捷方式:
chmod 755 /opt/cloudera/parcels/CDH/bin/spark3-submit
alternatives --install /usr/bin/spark-submit spark-submit /opt/cloudera/parcels/CDH/bin/spark3-submit 1
4.2 spark2
vim /opt/cloudera/parcels/CDH/bin/spark2-submit
#!/usr/bin/env bash
# spark2-submit: runs org.apache.spark.deploy.SparkSubmit through
# spark2/bin/spark-class, forwarding every argument unchanged.

# Cluster configuration directory for both Hadoop and YARN clients.
export HADOOP_CONF_DIR=/etc/hadoop/conf
export YARN_CONF_DIR=/etc/hadoop/conf

# Follow symbolic links to find the directory that really holds this script.
# NOTE(review): SOURCE and BIN_DIR are not read again in this script because
# LIB_DIR below is a fixed path; confirm the sourced helper does not rely on
# them before removing this section.
SOURCE="${BASH_SOURCE[0]}"
BIN_DIR="$(dirname "$SOURCE")"
while [[ -L "$SOURCE" ]]; do
  SOURCE="$(readlink "$SOURCE")"
  # Relative link targets are taken relative to the link's own directory.
  if [[ $SOURCE != /* ]]; then
    SOURCE="$BIN_DIR/$SOURCE"
  fi
  BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"
done
BIN_DIR="$(cd -P "$(dirname "$SOURCE")" && pwd)"

# Fixed library directory of the CDH parcel.
LIB_DIR=/opt/cloudera/parcels/CDH/lib
export HADOOP_HOME="$LIB_DIR/hadoop"

# Autodetect JAVA_HOME if not defined
. "$LIB_DIR/bigtop-utils/bigtop-detect-javahome"

# disable randomized hash for string in Python 3.3+
export PYTHONHASHSEED=0

# Replace this shell with the Spark launcher.
exec "$LIB_DIR/spark2/bin/spark-class" org.apache.spark.deploy.SparkSubmit "$@"
配置 spark2-submit 快捷方式:
chmod 755 /opt/cloudera/parcels/CDH/bin/spark2-submit
alternatives --install /usr/bin/spark-submit spark-submit /opt/cloudera/parcels/CDH/bin/spark2-submit 2