1. Component versions
| Name | Version |
| --- | --- |
| hadoop | 3.4.1 |
| flink | 1.20.1 |
| hive | 4.0.1 |
| kafka | 3.9.0 |
| zookeeper | 3.9.3 |
| tez | 0.10.4 |
| spark (hadoop3) | 3.5.4 |
| jdk | 11.0.13 |
| maven | 3.9.9 |
Environment variable configuration
After editing and saving with vim, run source /etc/profile for the changes to take effect.
LD_LIBRARY_PATH=/usr/local/lib
export LD_LIBRARY_PATH
# Java environment
export JAVA_HOME=/cluster/jdk
export CLASSPATH=.:$JAVA_HOME/lib:$JAVA_HOME/lib/tools.jar:$JAVA_HOME/lib/dt.jar
export TEZ_HOME=/cluster/tez/
export TEZ_CONF_DIR=$TEZ_HOME/conf
export TEZ_JARS=$TEZ_HOME/*.jar:$TEZ_HOME/lib/*.jar
# Hadoop ecosystem
export HADOOP_HOME=/cluster/hadoop3
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
HADOOP_CLASSPATH=`hadoop classpath`
export HADOOP_CLASSPATH=$TEZ_CONF_DIR:$TEZ_JARS:$HADOOP_CLASSPATH
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
# Hive configuration
export HIVE_HOME=/cluster/hive
export HIVE_CONF_DIR=$HIVE_HOME/conf
# Spark configuration
export SPARK_HOME=/cluster/spark
export SPARK_LOCAL_IP=10.10.10.99
export SPARK_CONF_DIR=$SPARK_HOME/conf
# Flink configuration
export FLINK_HOME=/cluster/flink
# ZooKeeper/Kafka
export ZOOKEEPER_HOME=/cluster/zookeeper
export KAFKA_HOME=/cluster/kafka
# Other tools
export FLUME_HOME=/cluster/flume
export M2_HOME=/cluster/maven
# Dynamic link libraries
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native/:$LD_LIBRARY_PATH
# Merge everything into PATH
export PATH=$PATH:$HIVE_HOME/bin:$JAVA_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$M2_HOME/bin:$FLINK_HOME/bin:$ZOOKEEPER_HOME/bin
export LC_ALL=zh_CN.UTF-8
export LANG=zh_CN.UTF-8
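With /etc/profile saved, a quick sanity check along these lines (a minimal sketch; it assumes the install paths above actually exist on this host) confirms the variables and the versions from section 1:
# Reload the profile in the current shell
source /etc/profile
# Spot-check a few of the variables
echo $JAVA_HOME $HADOOP_HOME $HIVE_HOME $FLINK_HOME
# Each command should report the version listed in the table above
java -version
hadoop version
hive --version
flink --version
mvn -v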
2. Hadoop configuration
hadoop-env.sh
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Set Hadoop-specific environment variables here.
##
## THIS FILE ACTS AS THE MASTER FILE FOR ALL HADOOP PROJECTS.
## SETTINGS HERE WILL BE READ BY ALL HADOOP COMMANDS. THEREFORE,
## ONE CAN USE THIS FILE TO SET YARN, HDFS, AND MAPREDUCE
## CONFIGURATION OPTIONS INSTEAD OF xxx-env.sh.
##
## Precedence rules:
##
## {yarn-env.sh|hdfs-env.sh} > hadoop-env.sh > hard-coded defaults
##
## {YARN_xyz|HDFS_xyz} > HADOOP_xyz > hard-coded defaults
##
# Many of the options here are built from the perspective that users
# may want to provide OVERWRITING values on the command line.
# For example:
#
# JAVA_HOME=/usr/java/testing hdfs dfs -ls
#
# Therefore, the vast majority (BUT NOT ALL!) of these defaults
# are configured for substitution and not append. If append
# is preferable, modify this file accordingly.
###
# Generic settings for HADOOP
###
# Technically, the only required environment variable is JAVA_HOME.
# All others are optional. However, the defaults are probably not
# preferred. Many sites configure these options outside of Hadoop,
# such as in /etc/profile.d
# The java implementation to use. By default, this environment
# variable is REQUIRED on ALL platforms except OS X!
# export JAVA_HOME=
# Location of Hadoop. By default, Hadoop will attempt to determine
# this location based upon its execution path.
# export HADOOP_HOME=
# Location of Hadoop's configuration information. i.e., where this
# file is living. If this is not defined, Hadoop will attempt to
# locate it based upon its execution path.
#
# NOTE: It is recommended that this variable not be set here but in
# /etc/profile.d or equivalent. Some options (such as
# --config) may react strangely otherwise.
#
# export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
# The maximum amount of heap to use (Java -Xmx). If no unit
# is provided, it will be converted to MB. Daemons will
# prefer any Xmx setting in their respective _OPT variable.
# There is no default; the JVM will autoscale based upon machine
# memory size.
# export HADOOP_HEAPSIZE_MAX=
# The minimum amount of heap to use (Java -Xms). If no unit
# is provided, it will be converted to MB. Daemons will
# prefer any Xms setting in their respective _OPT variable.
# There is no default; the JVM will autoscale based upon machine
# memory size.
# export HADOOP_HEAPSIZE_MIN=
# Enable extra debugging of Hadoop's JAAS binding, used to set up
# Kerberos security.
# export HADOOP_JAAS_DEBUG=true
# Extra Java runtime options for all Hadoop commands. We don't support
# IPv6 yet/still, so by default the preference is set to IPv4.
# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true"
# For Kerberos debugging, an extended option set logs more information
# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"
# Some parts of the shell code may do special things dependent upon
# the operating system. We have to set this here. See the next
# section as to why....
export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
# Extra Java runtime options for some Hadoop commands
# and clients (i.e., hdfs dfs -blah). These get appended to HADOOP_OPTS for
# such commands. In most cases, # this should be left empty and
# let users supply it on the command line.
# export HADOOP_CLIENT_OPTS=""
#
# A note about classpaths.
#
# By default, Apache Hadoop overrides Java's CLASSPATH
# environment variable. It is configured such
# that it starts out blank with new entries added after passing
# a series of checks (file/dir exists, not already listed aka
# de-duplication). During de-duplication, wildcards and/or
# directories are *NOT* expanded to keep it simple. Therefore,
# if the computed classpath has two specific mentions of
# awesome-methods-1.0.jar, only the first one added will be seen.
# If two directories are in the classpath that both contain
# awesome-methods-1.0.jar, then Java will pick up both versions.
# An additional, custom CLASSPATH. Site-wide configs should be
# handled via the shellprofile functionality, utilizing the
# hadoop_add_classpath function for greater control and much
# harder for apps/end-users to accidentally override.
# Similarly, end users should utilize ${HOME}/.hadooprc .
# This variable should ideally only be used as a short-cut,
# interactive way for temporary additions on the command line.
# export HADOOP_CLASSPATH="/some/cool/path/on/your/machine"
# Should HADOOP_CLASSPATH be first in the official CLASSPATH?
# export HADOOP_USER_CLASSPATH_FIRST="yes"
# If HADOOP_USE_CLIENT_CLASSLOADER is set, the classpath along
# with the main jar are handled by a separate isolated
# client classloader when 'hadoop jar', 'yarn jar', or 'mapred job'
# is utilized. If it is set, HADOOP_CLASSPATH and
# HADOOP_USER_CLASSPATH_FIRST are ignored.
# export HADOOP_USE_CLIENT_CLASSLOADER=true
# HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES overrides the default definition of
# system classes for the client classloader when HADOOP_USE_CLIENT_CLASSLOADER
# is enabled. Names ending in '.' (period) are treated as package names, and
# names starting with a '-' are treated as negative matches. For example,
# export HADOOP_CLIENT_CLASSLOADER_SYSTEM_CLASSES="-org.apache.hadoop.UserClass,java.,javax.,org.apache.hadoop."
# Enable optional, bundled Hadoop features
# This is a comma delimited list. It may NOT be overridden via .hadooprc
# Entries may be added/removed as needed.
# export HADOOP_OPTIONAL_TOOLS="hadoop-kafka,hadoop-aws,hadoop-azure-datalake,hadoop-aliyun,hadoop-azure"
###
# Options for remote shell connectivity
###
# There are some optional components of hadoop that allow for
# command and control of remote hosts. For example,
# start-dfs.sh will attempt to bring up all NNs, DNs, etc.
# Options to pass to SSH when one of the "log into a host and
# start/stop daemons" scripts is executed
# export HADOOP_SSH_OPTS="-o BatchMode=yes -o StrictHostKeyChecking=no -o ConnectTimeout=10s"
# The built-in ssh handler will limit itself to 10 simultaneous connections.
# For pdsh users, this sets the fanout size ( -f )
# Change this to increase/decrease as necessary.
# export HADOOP_SSH_PARALLEL=10
# Filename which contains all of the hosts for any remote execution
# helper scripts # such as workers.sh, start-dfs.sh, etc.
# export HADOOP_WORKERS="${HADOOP_CONF_DIR}/workers"
###
# Options for all daemons
###
#
#
# Many options may also be specified as Java properties. It is
# very common, and in many cases, desirable, to hard-set these
# in daemon _OPTS variables. Where applicable, the appropriate
# Java property is also identified. Note that many are re-used
# or set differently in certain contexts (e.g., secure vs
# non-secure)
#
# Where (primarily) daemon log files are stored.
# ${HADOOP_HOME}/logs by default.
# Java property: hadoop.log.dir
# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
# A string representing this instance of hadoop. $USER by default.
# This is used in writing log and pid files, so keep that in mind!
# Java property: hadoop.id.str
# export HADOOP_IDENT_STRING=$USER
# How many seconds to pause after stopping a daemon
# export HADOOP_STOP_TIMEOUT=5
# Where pid files are stored. /tmp by default.
# export HADOOP_PID_DIR=/tmp
# Default log4j setting for interactive commands
# Java property: hadoop.root.logger
# export HADOOP_ROOT_LOGGER=INFO,console
# Default log4j setting for daemons spawned explicitly by
# --daemon option of hadoop, hdfs, mapred and yarn command.
# Java property: hadoop.root.logger
# export HADOOP_DAEMON_ROOT_LOGGER=INFO,RFA
# Default log level and output location for security-related messages.
# You will almost certainly want to change this on a per-daemon basis via
# the Java property (i.e., -Dhadoop.security.logger=foo). (Note that the
# defaults for the NN and 2NN override this by default.)
# Java property: hadoop.security.logger
# export HADOOP_SECURITY_LOGGER=INFO,NullAppender
# Default process priority level
# Note that sub-processes will also run at this level!
# export HADOOP_NICENESS=0
# Default name for the service level authorization file
# Java property: hadoop.policy.file
# export HADOOP_POLICYFILE="hadoop-policy.xml"
#
# NOTE: this is not used by default! <-----
# You can define variables right here and then re-use them later on.
# For example, it is common to use the same garbage collection settings
# for all the daemons. So one could define:
#
# export HADOOP_GC_SETTINGS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps"
#
# .. and then use it as per the b option under the namenode.
###
# Secure/privileged execution
###
#
# Out of the box, Hadoop uses jsvc from Apache Commons to launch daemons
# on privileged ports. This functionality can be replaced by providing
# custom functions. See hadoop-functions.sh for more information.
#
# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
# export JSVC_HOME=/usr/bin
#
# This directory contains pids for secure and privileged processes.
#export HADOOP_SECURE_PID_DIR=${HADOOP_PID_DIR}
#
# This directory contains the logs for secure and privileged processes.
# Java property: hadoop.log.dir
# export HADOOP_SECURE_LOG=${HADOOP_LOG_DIR}
#
# When running a secure daemon, the default value of HADOOP_IDENT_STRING
# ends up being a bit bogus. Therefore, by default, the code will
# replace HADOOP_IDENT_STRING with HADOOP_xx_SECURE_USER. If one wants
# to keep HADOOP_IDENT_STRING untouched, then uncomment this line.
# export HADOOP_SECURE_IDENT_PRESERVE="true"
###
# NameNode specific parameters
###
# Default log level and output location for file system related change
# messages. For non-namenode daemons, the Java property must be set in
# the appropriate _OPTS if one wants something other than INFO,NullAppender
# Java property: hdfs.audit.logger
# export HDFS_AUDIT_LOGGER=INFO,NullAppender
# Specify the JVM options to be used when starting the NameNode.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# a) Set JMX options
# export HDFS_NAMENODE_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.port=1026"
#
# b) Set garbage collection logs
# export HDFS_NAMENODE_OPTS="${HADOOP_GC_SETTINGS} -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')"
#
# c) ... or set them directly
# export HDFS_NAMENODE_OPTS="-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -Xloggc:${HADOOP_LOG_DIR}/gc-rm.log-$(date +'%Y%m%d%H%M')"
# this is the default:
# export HDFS_NAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"
###
# SecondaryNameNode specific parameters
###
# Specify the JVM options to be used when starting the SecondaryNameNode.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# This is the default:
# export HDFS_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=INFO,RFAS"
###
# DataNode specific parameters
###
# Specify the JVM options to be used when starting the DataNode.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# This is the default:
# export HDFS_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS"
# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
# This will replace the hadoop.id.str Java property in secure mode.
# export HDFS_DATANODE_SECURE_USER=hdfs
# Supplemental options for secure datanodes
# By default, Hadoop uses jsvc which needs to know to launch a
# server jvm.
# export HDFS_DATANODE_SECURE_EXTRA_OPTS="-jvm server"
###
# NFS3 Gateway specific parameters
###
# Specify the JVM options to be used when starting the NFS3 Gateway.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_NFS3_OPTS=""
# Specify the JVM options to be used when starting the Hadoop portmapper.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_PORTMAP_OPTS="-Xmx512m"
# Supplemental options for privileged gateways
# By default, Hadoop uses jsvc which needs to know to launch a
# server jvm.
# export HDFS_NFS3_SECURE_EXTRA_OPTS="-jvm server"
# On privileged gateways, user to run the gateway as after dropping privileges
# This will replace the hadoop.id.str Java property in secure mode.
# export HDFS_NFS3_SECURE_USER=nfsserver
###
# ZKFailoverController specific parameters
###
# Specify the JVM options to be used when starting the ZKFailoverController.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_ZKFC_OPTS=""
###
# QuorumJournalNode specific parameters
###
# Specify the JVM options to be used when starting the QuorumJournalNode.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_JOURNALNODE_OPTS=""
###
# HDFS Balancer specific parameters
###
# Specify the JVM options to be used when starting the HDFS Balancer.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_BALANCER_OPTS=""
###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_MOVER_OPTS=""
###
# Router-based HDFS Federation specific parameters
# Specify the JVM options to be used when starting the RBF Routers.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_DFSROUTER_OPTS=""
###
# HDFS StorageContainerManager specific parameters
###
# Specify the JVM options to be used when starting the HDFS Storage Container Manager.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HDFS_STORAGECONTAINERMANAGER_OPTS=""
###
# Advanced Users Only!
###
#
# When building Hadoop, one can add the class paths to the commands
# via this special env var:
# export HADOOP_ENABLE_BUILD_PATHS="true"
#
# To prevent accidents, shell commands can be (superficially) locked
# to only allow certain users to execute certain subcommands.
# It uses the format of (command)_(subcommand)_USER.
#
# For example, to limit who can execute the namenode command,
# export HDFS_NAMENODE_USER=hdfs
###
# Registry DNS specific parameters
###
# For privileged registry DNS, user to run as after dropping privileges
# This will replace the hadoop.id.str Java property in secure mode.
# export HADOOP_REGISTRYDNS_SECURE_USER=yarn
# Supplemental options for privileged registry DNS
# By default, Hadoop uses jsvc which needs to know to launch a
# server jvm.
# export HADOOP_REGISTRYDNS_SECURE_EXTRA_OPTS="-jvm server"
#export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
export HIVE_HOME=/cluster/hive
export HIVE_CONF_DIR=$HIVE_HOME/conf
export JAVA_HOME=/cluster/jdk
export CLASSPATH=.:$JAVA_HOME/lib:$JAVA_HOME/lib/tools.jar:$JAVA_HOME/lib/dt.jar
export HADOOP_HOME=/cluster/hadoop3
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native/:$LD_LIBRARY_PATH
export SPARK_HOME=/cluster/spark
export SPARK_LOCAL_IP=10.10.10.99
export SPARK_CONF_DIR=$SPARK_HOME/conf
export PATH=$PATH:$JAVA_HOME/jre/bin:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$SPARK_HOME/bin:$SPARK_HOME/sbin
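Both /etc/profile and hadoop-env.sh put $HADOOP_HOME/lib/native on LD_LIBRARY_PATH, so it is worth confirming that the native libraries are actually picked up (an optional sketch):
# Reports which native libraries (hadoop, zlib, zstd, ...) were found
hadoop checknative -a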
core-site.xml
hdfs-site.xml
mapred-site.xml
yarn-site.xml
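Once these four files are in place (this sketch assumes fs.defaultFS points at hdfs://10.10.10.99:9000, the address used by the Hive and Flink configuration below), a rough first-start sequence is:
# One-time only on a fresh cluster: formatting erases existing HDFS metadata
hdfs namenode -format
# Start HDFS and YARN; the scripts are on PATH via /etc/profile
start-dfs.sh
start-yarn.sh
# Verify the daemons and the cluster state
jps
hdfs dfsadmin -report
yarn node -list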
3. Hive configuration
hive-env.sh
hive-site.xml
<configuration>
<!-- JDO connection URL for the PostgreSQL database that stores the Hive metastore -->
<!-- The database is created automatically if it does not exist -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:postgresql://10.7.215.181:3024/hive?createDatabaseIfNotExist=true</value>
</property>
<!-- JDBC driver class used for the metastore connection -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
</property>
<!-- Database username -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>postgres</value>
</property>
<!-- Database password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>postgres</value>
</property>
<!-- HDFS directory for Hive scratch (temporary) files -->
<property>
<name>hive.exec.scratchdir</name>
<value>hdfs://10.10.10.99:9000/cluster/hive/scratchdir</value>
</property>
<!-- Commented-out option for the Hive user install directory -->
<!--
<property>
<name>hive.user.install.directory</name>
<value>/cluster/hive/install_dir</value>
</property>
-->
<!-- Location for Hive query logs, on HDFS -->
<property>
<name>hive.querylog.location</name>
<value>hdfs://10.10.10.99:9000/cluster/hive/scratchdir</value>
</property>
<!-- Whether the metastore event DB notification API requires authentication; disabled here -->
<property>
<name>metastore.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
<!-- Local directory for resources downloaded by Hive -->
<property>
<name>hive.downloaded.resources.dir</name>
<value>/cluster/hive/downloaded</value>
</property>
<!-- Whether HiveServer2 operation logging is enabled -->
<property>
<name>hive.server2.logging.operation.enabled</name>
<value>true</value>
</property>
<!-- Location of HiveServer2 operation logs -->
<property>
<name>hive.server2.logging.operation.log.location</name>
<value>/cluster/hive/logs</value>
</property>
<!-- Metastore URI; clients connect to this host and port over Thrift -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://10.10.10.99:9083</value>
</property>
<!-- Metastore client socket timeout (interpreted in seconds unless a unit is given) -->
<property>
<name>hive.metastore.client.socket.timeout</name>
<value>3000</value>
</property>
<!-- Hive warehouse directory on HDFS -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs://10.10.10.99:9000/cluster/hive/warehouse</value>
</property>
<!-- Spark SQL warehouse directory on HDFS -->
<property>
<name>spark.sql.warehouse.dir</name>
<value>hdfs://10.10.10.99:9000/cluster/hive/sparksql</value>
</property>
<!-- Whether to automatically convert joins to map joins -->
<property>
<name>hive.auto.convert.join</name>
<value>true</value>
</property>
<!-- Size threshold (bytes) below which joins are converted to a single map join without a conditional task -->
<property>
<name>hive.auto.convert.join.noconditionaltask.size</name>
<value>20971520</value>
</property>
<!-- Whether to optimize bucket map joins into sorted-merge joins -->
<property>
<name>hive.optimize.bucketmapjoin.sortedmerge</name>
<value>false</value>
</property>
<!-- Number of rows cached for SMB (sort-merge-bucket) joins -->
<property>
<name>hive.smbjoin.cache.rows</name>
<value>10000</value>
</property>
<!-- Number of reduce tasks for MapReduce jobs; -1 lets Hive decide -->
<property>
<name>mapred.reduce.tasks</name>
<value>-1</value>
</property>
<!-- Data volume (bytes) per reducer -->
<property>
<name>hive.exec.reducers.bytes.per.reducer</name>
<value>67108864</value>
</property>
<!-- Maximum file size (bytes) that Hive will copy directly -->
<property>
<name>hive.exec.copyfile.maxsize</name>
<value>33554432</value>
</property>
<!-- Maximum number of reducers -->
<property>
<name>hive.exec.reducers.max</name>
<value>1099</value>
</property>
<!-- Check interval (rows) for the vectorized GROUP BY hash table -->
<property>
<name>hive.vectorized.groupby.checkinterval</name>
<value>4096</value>
</property>
<!-- Flush ratio for the vectorized GROUP BY hash table -->
<property>
<name>hive.vectorized.groupby.flush.percent</name>
<value>0.1</value>
</property>
<!-- Whether to use stored statistics when answering queries -->
<property>
<name>hive.compute.query.using.stats</name>
<value>false</value>
</property>
<!-- Whether to enable the vectorized execution engine -->
<!-- When using Hive with Tez on Apache Iceberg tables, vectorization must also be disabled, so it is turned off for now -->
<property>
<name>hive.vectorized.execution.enabled</name>
<value>false</value>
</property>
<!-- Whether to enable vectorized execution in the reduce phase -->
<property>
<name>hive.vectorized.execution.reduce.enabled</name>
<value>true</value>
</property>
<!-- Whether to use vectorized input formats -->
<property>
<name>hive.vectorized.use.vectorized.input.format</name>
<value>true</value>
</property>
<!-- Whether to use vectorized serialization and deserialization -->
<property>
<name>hive.vectorized.use.vector.serde.deserialize</name>
<value>false</value>
</property>
<!-- Usage mode of the vectorized adaptor -->
<property>
<name>hive.vectorized.adaptor.usage.mode</name>
<value>chosen</value>
</property>
<!-- Whether to merge small files produced by map-only jobs -->
<property>
<name>hive.merge.mapfiles</name>
<value>true</value>
</property>
<!-- Whether to merge small files produced by MapReduce jobs -->
<property>
<name>hive.merge.mapredfiles</name>
<value>false</value>
</property>
<!-- Whether to enable cost-based optimization (CBO) -->
<property>
<name>hive.cbo.enable</name>
<value>false</value>
</property>
<!-- Fetch task conversion level -->
<property>
<name>hive.fetch.task.conversion</name>
<value>minimal</value>
</property>
<!-- Data size threshold (bytes) that triggers fetch task conversion -->
<property>
<name>hive.fetch.task.conversion.threshold</name>
<value>268435456</value>
</property>
<!-- Memory usage fraction for LIMIT pushdown -->
<property>
<name>hive.limit.pushdown.memory.usage</name>
<value>0.1</value>
</property>
<!-- Average output file size (bytes) below which small files are merged -->
<property>
<name>hive.merge.smallfiles.avgsize</name>
<value>134217728</value>
</property>
<!-- Target size (bytes) of merged files per task -->
<property>
<name>hive.merge.size.per.task</name>
<value>268435456</value>
</property>
<!-- Whether to enable reduce deduplication optimization -->
<property>
<name>hive.optimize.reducededuplication</name>
<value>true</value>
</property>
<!-- Minimum number of reducers required to apply reduce deduplication -->
<property>
<name>hive.optimize.reducededuplication.min.reducer</name>
<value>4</value>
</property>
<!-- Whether to enable map-side aggregation -->
<property>
<name>hive.map.aggr</name>
<value>true</value>
</property>
<!-- Fraction of memory used for the map-side aggregation hash table -->
<property>
<name>hive.map.aggr.hash.percentmemory</name>
<value>0.5</value>
</property>
<!-- Whether to automatically gather column statistics -->
<property>
<name>hive.stats.column.autogather</name>
<value>false</value>
</property>
<!-- Hive execution engine (mr, tez, or spark) -->
<!-- The original configuration used mr -->
<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>
<!-- Spark executor memory -->
<property>
<name>spark.executor.memory</name>
<value>2572261785b</value>
</property>
<!-- Spark driver memory -->
<property>
<name>spark.driver.memory</name>
<value>3865470566b</value>
</property>
<!-- Number of cores per Spark executor -->
<property>
<name>spark.executor.cores</name>
<value>4</value>
</property>
<!-- Spark driver memory overhead -->
<property>
<name>spark.yarn.driver.memoryOverhead</name>
<value>409m</value>
</property>
<!-- Spark executor memory overhead -->
<property>
<name>spark.yarn.executor.memoryOverhead</name>
<value>432m</value>
</property>
<!-- Whether to enable dynamic resource allocation -->
<property>
<name>spark.dynamicAllocation.enabled</name>
<value>true</value>
</property>
<!-- Initial number of executors for dynamic allocation -->
<property>
<name>spark.dynamicAllocation.initialExecutors</name>
<value>1</value>
</property>
<!-- Maximum number of executors for dynamic allocation -->
<property>
<name>spark.dynamicAllocation.maxExecutors</name>
<value>2147483647</value>
</property>
<!-- Whether the metastore executes file operations with the client's user and group (setugi) -->
<property>
<name>hive.metastore.execute.setugi</name>
<value>true</value>
</property>
<!-- Whether to support concurrency -->
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<!-- ZooKeeper server list -->
<property>
<name>hive.zookeeper.quorum</name>
<value>10.10.10.99</value>
</property>
<!-- ZooKeeper client port -->
<property>
<name>hive.zookeeper.client.port</name>
<value>2181</value>
</property>
<!-- ZooKeeper namespace used by Hive -->
<property>
<name>hive.zookeeper.namespace</name>
<value>hive_zookeeper_namespace_hive</value>
</property>
<!-- Delegation token store class for the cluster -->
<property>
<name>hive.cluster.delegation.token.store.class</name>
<value>org.apache.hadoop.hive.thrift.MemoryTokenStore</value>
</property>
<!-- Whether to enable HiveServer2 impersonation (doAs) -->
<property>
<name>hive.server2.enable.doAs</name>
<value>false</value>
</property>
<!-- Whether to enable the Spark shuffle service -->
<property>
<name>spark.shuffle.service.enabled</name>
<value>true</value>
</property>
<!-- Whether to perform strict type-safety checks -->
<property>
<name>hive.strict.checks.type.safety</name>
<value>true</value>
</property>
<!-- Whether to perform strict Cartesian product checks -->
<property>
<name>hive.strict.checks.cartesian.product</name>
<value>false</value>
</property>
<!-- Whether to perform strict bucketing checks -->
<property>
<name>hive.strict.checks.bucketing</name>
<value>true</value>
</property>
<!-- Port for HiveServer2 client connections -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<!-- Host that HiveServer2 binds to -->
<property>
<name>hive.server2.thrift.bind.host</name>
<value>10.10.10.99</value>
</property>
<!-- Host the HiveServer2 web UI binds to -->
<property>
<name>hive.server2.webui.host</name>
<value>10.10.10.99</value>
</property>
<!-- Port for the HiveServer2 web UI -->
<property>
<name>hive.server2.webui.port</name>
<value>10002</value>
</property>
<!-- Whether to verify the metastore schema version -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- Hive CLI: print the current database and column headers -->
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
<!-- Dynamic partition mode; nonstrict allows dynamic partitioning -->
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<!-- Hive transaction manager class -->
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<!-- Whether to run the compactor initiator -->
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<!-- Number of compactor worker threads -->
<property>
<name>hive.compactor.worker.threads</name>
<value>1</value>
</property>
<!-- Test-environment setting; set to true to avoid errors such as: -->
<!-- Error rolling back: Can't call rollback when autocommit=true -->
<property>
<name>hive.in.test</name>
<value>true</value>
</property>
<!-- Whether to check metastore client capabilities -->
<property>
<name>metastore.client.capability.check</name>
<value>false</value>
</property>
<!-- Whether to enable the Iceberg engine integration with Hive -->
<property>
<name>iceberg.engine.hive.enabled</name>
<value>true</value>
</property>
<!-- Whether to enable Iceberg shared-scan optimization in Hive -->
<property>
<name>hive.iceberg.optimize.shared.scan</name>
<value>true</value>
</property>
<!-- Enable the Iceberg storage handler -->
<property>
<name>hive.iceberg.enabled</name>
<value>true</value>
</property>
<!-- Whether to update Tez execution progress in place -->
<property>
<name>hive.tez.exec.inplace.progress</name>
<value>false</value>
</property>
<!-- Whether to enable local-fetch optimization in the Tez runtime -->
<property>
<name>tez.runtime.optimize.local.fetch</name>
<value>true</value>
</property>
<!-- Whether to submit local tasks via a child JVM -->
<property>
<name>hive.exec.submit.local.task.via.child</name>
<value>false</value>
</property>
<!-- Framework used to run MapReduce jobs; YARN here -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- Whether to run Tez in local mode -->
<property>
<name>tez.local.mode</name>
<value>false</value>
</property>
<!-- HDFS location of the Tez libraries -->
<property>
<name>tez.lib.uris</name>
<value>hdfs://10.10.10.99:9000/cluster/tez/libs</value>
</property>
<!-- Memory limit for a single reduce task (MB) -->
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>1024</value>
</property>
<!-- Heap size (-Xmx) for the reduce task JVM -->
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx819m</value>
</property>
<!-- Whether to clean up the scratch directory when the job finishes -->
<property>
<name>hive.exec.cleanup.scratchdir</name>
<value>true</value>
</property>
<!-- Whether to clean up the scratch directory immediately -->
<property>
<name>hive.exec.cleanup.scratchdir.immediate</name>
<value>true</value>
</property>
<!-- Additional Iceberg write settings -->
<property>
<name>hive.iceberg.write.format</name>
<value>parquet</value>
</property>
<property>
<name>hive.iceberg.auto.create.snapshot</name>
<value>true</value>
</property>
<property>
<name>hive.tez.container.size</name>
<value>1024</value>
</property>
<property>
<name>hive.cli.tez.session.async</name>
<value>false</value>
</property>
</configuration>
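Before this configuration is usable, the metastore schema and the HDFS paths it references need to exist. A minimal sketch (the local Tez tarball path and the root beeline user are assumptions; adjust to your installation):
# Create the HDFS directories referenced in hive-site.xml
hdfs dfs -mkdir -p /cluster/hive/scratchdir /cluster/hive/warehouse /cluster/hive/sparksql
# Upload the Tez libraries to the location named in tez.lib.uris
hdfs dfs -mkdir -p /cluster/tez/libs
hdfs dfs -put /cluster/tez/share/tez.tar.gz /cluster/tez/libs/
# Initialize the metastore schema in PostgreSQL (matches the JDO settings above)
schematool -dbType postgres -initSchema
# Start the metastore (thrift://10.10.10.99:9083) and HiveServer2 (port 10000)
nohup hive --service metastore > /tmp/metastore.log 2>&1 &
nohup hive --service hiveserver2 > /tmp/hiveserver2.log 2>&1 &
# Connect through beeline to confirm HiveServer2 answers
beeline -u jdbc:hive2://10.10.10.99:10000 -n root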
4. Flink configuration
config.yaml
env:
java:
opts:
all: --add-exports=java.base/sun.net.util=ALL-UNNAMED --add-exports=java.rmi/sun.rmi.registry=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED --add-exports=java.security.jgss/sun.security.krb5=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.text=ALL-UNNAMED --add-opens=java.base/java.time=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.locks=ALL-UNNAMED
#jobmanager: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5006"
#taskmanager: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005"
# JobManager debug port
#env.java.opts.jobmanager: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5006"
# TaskManager debug port
#env.java.opts.taskmanager: "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005"
#==============================================================================
# Common
#==============================================================================
jobmanager:
bind-host: 0.0.0.0
rpc:
address: 0.0.0.0
port: 6123
memory:
process:
size: 1600m
execution:
failover-strategy: region
archive:
fs:
dir: hdfs://10.10.10.99:9000/flink/completed-jobs/
taskmanager:
bind-host: 0.0.0.0
host: 0.0.0.0
numberOfTaskSlots: 100
memory:
process:
size: 1728m
network:
fraction: 0.1
min: 64mb
max: 1gb
parallelism:
default: 1
fs:
default-scheme: hdfs://10.10.10.99:9000
#==============================================================================
# High Availability (ZooKeeper authentication is not enabled yet; worth looking into how to enable it)
#==============================================================================
high-availability:
# The high-availability mode. Possible options are 'NONE' or 'zookeeper'.
type: zookeeper
# The path where metadata for master recovery is persisted. While ZooKeeper stores
# the small ground truth for checkpoint and leader election, this location stores
# the larger objects, like persisted dataflow graphs.
#
# Must be a durable file system that is accessible from all nodes
# (like HDFS, S3, Ceph, nfs, ...)
storageDir: hdfs:///flink/ha/
zookeeper:
# The list of ZooKeeper quorum peers that coordinate the high-availability
# setup. This must be a list of the form:
# "host1:clientPort,host2:clientPort,..." (default clientPort: 2181)
quorum: localhost:2181
client:
# ACL options are based on https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_BuiltinACLSchemes
# It can be either "creator" (ZOO_CREATE_ALL_ACL) or "open" (ZOO_OPEN_ACL_UNSAFE)
# The default value is "open" and it can be changed to "creator" if ZK security is enabled
acl: open
#==============================================================================
# Fault tolerance and checkpointing
#==============================================================================
# The backend that will be used to store operator state checkpoints if
# checkpointing is enabled. Checkpointing is enabled when execution.checkpointing.interval > 0.
# # Execution checkpointing related parameters. Please refer to CheckpointConfig and CheckpointingOptions for more details.
execution:
checkpointing:
interval: 3min
externalized-checkpoint-retention: DELETE_ON_CANCELLATION
max-concurrent-checkpoints: 1
min-pause: 0s
mode: EXACTLY_ONCE
timeout: 10min
tolerable-failed-checkpoints: 0
unaligned: false
state:
backend:
type: hashmap
incremental: false
checkpoints:
dir: hdfs://10.10.10.99:9000/flink/flink-checkpoints
savepoints:
dir: hdfs://10.10.10.99:9000/flink/flink-savepoints
#==============================================================================
# Rest & web frontend
#==============================================================================
rest:
address: 0.0.0.0
bind-address: 0.0.0.0
web:
submit:
enable: true
cancel:
enable: true
#==============================================================================
# Advanced
#==============================================================================
io:
tmp:
dirs: /tmp
classloader:
resolve:
order: parent-first
#order: child-first
#==============================================================================
# Flink Cluster Security Configuration
#==============================================================================
# Kerberos authentication for various components - Hadoop, ZooKeeper, and connectors -
# may be enabled in four steps:
# 1. configure the local krb5.conf file
# 2. provide Kerberos credentials (either a keytab or a ticket cache w/ kinit)
# 3. make the credentials available to various JAAS login contexts
# 4. configure the connector to use JAAS/SASL
# # The below configure how Kerberos credentials are provided. A keytab will be used instead of
# # a ticket cache if the keytab path and principal are set.
# security:
# kerberos:
# login:
# use-ticket-cache: true
# keytab: /path/to/kerberos/keytab
# principal: flink-user
# # The configuration below defines which JAAS login contexts
# contexts: Client,KafkaClient
#==============================================================================
# ZK Security Configuration
#==============================================================================
# zookeeper:
# sasl:
# # Below configurations are applicable if ZK ensemble is configured for security
# #
# # Override below configuration to provide custom ZK service name if configured
# # zookeeper.sasl.service-name: zookeeper
# #
# # The configuration below must match one of the values set in "security.kerberos.login.contexts"
# login-context-name: Client
#==============================================================================
# HistoryServer
#==============================================================================
historyserver:
web:
address: 0.0.0.0
port: 8082
archive:
fs:
dir: hdfs://10.10.10.99:9000/flink/historyserver/completed-jobs/
fs.refresh-interval: 10000
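Finally, the HDFS paths referenced in config.yaml have to exist, and ZooKeeper must already be running for high-availability.type: zookeeper; the HADOOP_CLASSPATH exported in /etc/profile is what lets Flink reach HDFS. A minimal startup sketch:
# Create the checkpoint, savepoint, HA and job-archive directories
hdfs dfs -mkdir -p /flink/ha /flink/flink-checkpoints /flink/flink-savepoints
hdfs dfs -mkdir -p /flink/completed-jobs /flink/historyserver/completed-jobs
# ZooKeeper should already be up (quorum: localhost:2181)
zkServer.sh status
# Start the standalone cluster and the history server (web UI on port 8082)
start-cluster.sh
historyserver.sh start
jps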