apache hudi 初见

news2025/1/7 22:05:30

git clone https://gitee.com/apache/Hudi.git

先用 docker pull 拉取 yml 文件里引用的镜像

然后
docker-compose -f docker-compose_hadoop284_hive233_spark244.yml -p 6p6 up -d

然后就是
在这里插入图片描述

docker-compose_hadoop284_hive233_spark244.yml 的内容如下:

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Compose file format. 3.5 is the first v3 format whose schema accepts the
# custom `name` key on a network, which the `networks` section of this file
# relies on (`default.name: hudi`).
version: "3.5"

services:

  # HDFS NameNode container; most other services in this file link to it.
  namenode:
    image: apachehudi/hudi-hadoop_2.8.4-namenode:latest
    container_name: namenode
    hostname: namenode
    env_file:
      - ./hadoop.env
    environment:
      # NOTE(review): the label says hive232 while the Hive images in this file
      # are tagged hive_2.3.3 — confirm whether that is intentional.
      - "CLUSTER_NAME=hudi_hadoop284_hive232_spark244"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "8020:8020"
      - "50070:50070"
    healthcheck:
      # Probe the HTTP endpoint on 50070; `curl -f` exits non-zero on HTTP errors.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://namenode:50070"
      interval: 30s
      timeout: 10s
      retries: 3

  # HDFS DataNode container (started after the namenode service).
  datanode1:
    image: apachehudi/hudi-hadoop_2.8.4-datanode:latest
    hostname: datanode1
    container_name: datanode1
    env_file:
      - ./hadoop.env
    environment:
      - "CLUSTER_NAME=hudi_hadoop284_hive232_spark244"
    depends_on:
      - "namenode"
    links:
      - "historyserver"
      - "namenode"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "50010:50010"
      - "50075:50075"
    healthcheck:
      # Probe the HTTP endpoint on 50075; `curl -f` exits non-zero on HTTP errors.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://datanode1:50075"
      interval: 30s
      timeout: 10s
      retries: 3

  # Hadoop history server container; container port 8188 is published on host
  # port 58188.
  historyserver:
    image: apachehudi/hudi-hadoop_2.8.4-history:latest
    container_name: historyserver
    hostname: historyserver
    env_file:
      - ./hadoop.env
    environment:
      - "CLUSTER_NAME=hudi_hadoop284_hive232_spark244"
    volumes:
      # Named volume declared in the top-level `volumes` section.
      - "historyserver:/hadoop/yarn/timeline"
    links:
      - "namenode"
    depends_on:
      - "namenode"
    ports:
      - "58188:8188"
    healthcheck:
      # Probe the HTTP endpoint on 8188; `curl -f` exits non-zero on HTTP errors.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://historyserver:8188"
      interval: 30s
      timeout: 10s
      retries: 3

  # PostgreSQL container for the Hive metastore; no ports are published to the
  # host, other services reach it by hostname.
  hive-metastore-postgresql:
    image: "bde2020/hive-metastore-postgresql:2.3.0"
    container_name: hive-metastore-postgresql
    hostname: hive-metastore-postgresql
    volumes:
      # Named volume declared in the top-level `volumes` section.
      - "hive-metastore-postgresql:/var/lib/postgresql"

  # Hive metastore service (runs `hive --service metastore`), published on 9083.
  hivemetastore:
    image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
    container_name: hivemetastore
    hostname: hivemetastore
    command: /opt/hive/bin/hive --service metastore
    env_file:
      - ./hadoop.env
    environment:
      # Presumably the image entrypoint waits for these host:port pairs before
      # starting — TODO confirm against the image.
      - "SERVICE_PRECONDITION=namenode:50070 hive-metastore-postgresql:5432"
    depends_on:
      - "hive-metastore-postgresql"
      - "namenode"
    links:
      - "hive-metastore-postgresql"
      - "namenode"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "9083:9083"
    healthcheck:
      # `nc -z` only checks that port 9083 accepts a TCP connection.
      test:
        - "CMD"
        - "nc"
        - "-z"
        - "hivemetastore"
        - "9083"
      interval: 30s
      timeout: 10s
      retries: 3

  # HiveServer container; uses the same image as hivemetastore without a
  # command override.
  hiveserver:
    image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3:latest
    container_name: hiveserver
    hostname: hiveserver
    env_file:
      - ./hadoop.env
    environment:
      # Presumably the image entrypoint waits for the metastore port before
      # starting — TODO confirm against the image.
      - "SERVICE_PRECONDITION=hivemetastore:9083"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "namenode"
    depends_on:
      - "hivemetastore"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      # NOTE(review): HiveServer2's default Thrift port is 10000 — confirm the
      # image really listens on 10001.
      - "10001:10001"

  # Spark master container; workers and adhoc containers point at
  # spark://sparkmaster:7077.
  sparkmaster:
    image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:latest
    container_name: sparkmaster
    hostname: sparkmaster
    env_file:
      - ./hadoop.env
    environment:
      - "INIT_DAEMON_STEP=setup_spark"
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "hiveserver"
      - "namenode"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "7077:7077"
      # NOTE(review): the Spark master web UI defaults to 8080 — confirm the
      # image serves anything on 9090.
      - "9090:9090"

  # Spark worker container registered against the sparkmaster service.
  spark-worker-1:
    image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:latest
    container_name: spark-worker-1
    hostname: spark-worker-1
    env_file:
      - ./hadoop.env
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "hiveserver"
      - "namenode"
    depends_on:
      - "sparkmaster"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "8081:8081"

  # Zookeeper container; the kafka service connects to it at zookeeper:2181.
  zookeeper:
    image: "bitnami/zookeeper:3.4.12-r68"
    container_name: zookeeper
    hostname: zookeeper
    environment:
      # List form keeps the value a plain string; `yes` is never typed as a
      # YAML boolean here.
      - "ALLOW_ANONYMOUS_LOGIN=yes"
    ports:
      - "2181:2181"

  # Kafka broker. The service key is `kafka`, but the container and its
  # hostname inside the network are `kafkabroker`.
  kafka:
    image: 'bitnami/kafka:2.0.0'
    hostname: kafkabroker
    container_name: kafkabroker
    # The broker is configured with zookeeper:2181 below, so start Zookeeper
    # first. This only orders container start-up; it does not wait for
    # Zookeeper to be ready.
    depends_on:
      - zookeeper
    ports:
      - "9092:9092"
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes

  # Presto coordinator; same image as presto-worker-1, role chosen by `command`.
  presto-coordinator-1:
    image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest
    hostname: presto-coordinator-1
    container_name: presto-coordinator-1
    command: coordinator
    environment:
      - "PRESTO_JVM_MAX_HEAP=512M"
      - "PRESTO_QUERY_MAX_MEMORY=1GB"
      - "PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB"
      - "PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB"
      - "PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB"
      - "TERM=xterm"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    links:
      - "hivemetastore"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "9190:9190"

  # Presto worker; same image as the coordinator, role chosen by `command`.
  # No ports are published to the host.
  presto-worker-1:
    image: apachehudi/hudi-hadoop_2.8.4-prestobase_0.271:latest
    hostname: presto-worker-1
    container_name: presto-worker-1
    command: worker
    environment:
      - "PRESTO_JVM_MAX_HEAP=512M"
      - "PRESTO_QUERY_MAX_MEMORY=1GB"
      - "PRESTO_QUERY_MAX_MEMORY_PER_NODE=256MB"
      - "PRESTO_QUERY_MAX_TOTAL_MEMORY_PER_NODE=384MB"
      - "PRESTO_MEMORY_HEAP_HEADROOM_PER_NODE=100MB"
      - "TERM=xterm"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    depends_on:
      - "presto-coordinator-1"
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "hiveserver"
      - "namenode"

  # Trino coordinator. `command` passes two arguments to the image: the
  # coordinator URL and this node's name.
  trino-coordinator-1:
    image: apachehudi/hudi-hadoop_2.8.4-trinocoordinator_368:latest
    hostname: trino-coordinator-1
    container_name: trino-coordinator-1
    command: "http://trino-coordinator-1:9191 trino-coordinator-1"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    links:
      - "hivemetastore"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "9191:9191"

  # Trino worker. `command` passes two arguments to the image: the coordinator
  # URL and this node's name.
  trino-worker-1:
    image: apachehudi/hudi-hadoop_2.8.4-trinoworker_368:latest
    hostname: trino-worker-1
    container_name: trino-worker-1
    command: "http://trino-coordinator-1:9191 trino-worker-1"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    depends_on:
      - "trino-coordinator-1"
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "hiveserver"
      - "namenode"
    ports:
      # JVM debugging port (container port only, so the host port is random)
      - "5005"
      - "8092:8092"

  # Graphite/StatsD container (image graphiteapp/graphite-statsd).
  graphite:
    container_name: graphite
    hostname: graphite
    # No tag is given, so Docker resolves this image to `latest`.
    image: graphiteapp/graphite-statsd
    ports:
      # Quoted like every other port mapping in this file, so a YAML parser
      # always reads them as strings rather than guessing a numeric type.
      - "80:80"
      - "2003-2004:2003-2004"
      - "8126:8126"

  # First ad-hoc Spark container; linked to Hive, HDFS, Presto and Trino.
  adhoc-1:
    image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
    container_name: adhoc-1
    hostname: adhoc-1
    env_file:
      - ./hadoop.env
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    depends_on:
      - "sparkmaster"
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "hiveserver"
      - "namenode"
      - "presto-coordinator-1"
      - "trino-coordinator-1"
    ports:
      - "4040:4040"
      # JVM debugging port (mapped to 5006 on the host)
      - "5006:5005"

  # Second ad-hoc Spark container; same image and links as adhoc-1, but it only
  # publishes the JVM debugging port.
  adhoc-2:
    image: apachehudi/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:latest
    container_name: adhoc-2
    hostname: adhoc-2
    env_file:
      - ./hadoop.env
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    volumes:
      # Container path only: Compose creates an anonymous volume, no host
      # directory is bound here.
      - /var/hoodie/ws
    depends_on:
      - "sparkmaster"
    links:
      - "hive-metastore-postgresql"
      - "hivemetastore"
      - "hiveserver"
      - "namenode"
      - "presto-coordinator-1"
      - "trino-coordinator-1"
    ports:
      # JVM debugging port (mapped to 5005 on the host)
      - "5005:5005"

# Named volumes; an empty value means default settings.
volumes:
  # Mounted by the hive-metastore-postgresql service at /var/lib/postgresql.
  hive-metastore-postgresql:
  # Mounted by the historyserver service at /hadoop/yarn/timeline.
  historyserver:
  # Declared, but no service in this file mounts it.
  namenode:

# Every service joins the project's default network, which is given the
# explicit name `hudi`.
# NOTE(review): a custom network `name` needs Compose file format 3.5 or
# newer — check it against the `version` declared at the top of this file.
networks:
  default:
    name: hudi

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/740670.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

MySQL之InnoDB存储结构 | 京东物流技术团队

1 InnoDB存储引擎 InnoDB存储引擎最早由Innobase Oy公司开发(属第三方存储引擎)。从MySQL 5.5版本开始作为表的默认存储引擎。该存储引擎是第一个完整支持ACID事务的MySQL存储引擎,特点是行锁设计、支持MVCC、支持外键、提供一致性非锁定读…

机器学习实战 | 股票价格预测项目(深度学习初级)

目录 简介技术流程1. 载入依赖包2. 读取数据集3. 从数据集中分析价格4. 对数据排序5. 数据标准化6. 创建、训练和保存LSTM网络7. 使用LSTM模型进行股票价格预测8. 可视化预测和实际结果 完整程序 简介 准备写个系列博客介绍机器学习实战中的部分公开项目。首先从初级项目开始。…

面试题之spring源码

IOC的底层原理 : Spring是如何循环依赖的。(三级缓存,提前曝光): 循环依赖的定义:循环依赖就是循环引用,也就是两个或两个以上bean对象互相持有对方,最终形成闭环,比如A依赖B,B依赖C…

文件夹加密软件怎么选?文件夹加密软件盘点

文件夹是电脑储存数据的重要工具,那么该如何保护文件夹的数据安全呢?使用合适的文件夹加密软件可能是最简单的方法。那么文件夹加密软件该怎么选呢? 文件夹加密超级大师 文件夹加密超级大师可以说是最全能的文件夹加密软件,它拥有…

自己编写chrome插件

1.首先你需要一个manifest.json文件 {"manifest_version": 3,"name": "My Extension","version": "2.0","action": {"default_popup": "popup.html","default_icon": "icon.…

winform弹出消息自动消失

winform弹出消息自动消失 弹出消息后,在指定时间毫秒后消失.消息中包含异常消息,自动一直展示,点击关闭显示; 效果如图 using System; using System.Collections.Generic; using System.Text; using System.Threading.Tasks;usi…

保姆级系列教程-玩转Fiddler抓包教程(1)-HTTP和HTTPS基础知识

1.简介 有的小伙伴或者童鞋们可能会好奇地问,不是讲解和分享抓包工具了怎么这里开始讲解HTTP和HTTPS协议了。这是因为你对HTTP协议越了解,你就能越掌握Fiddler的使用方法,反过来你越使用Fiddler,就越能帮助你了解HTTP协议。 Fid…

全网最全,项目管理工具大合集!

早上好,我是老原。 很久没给大家更新工具,本以为之前更新的也够大家用了,没想到还是有很多小友来私信老原好用工具。 关注我比较久的粉丝都知道,我提倡的工具在精不在多,更多的把精力放在自身上,还有啥不…

Vue使用百度地图API详细教程

Vue使用百度地图详细教程 先提供几个文档 Vue-Baidu-map文档:https://dafrok.github.io/vue-baidu-map/#/zh/index 百度地图JavaScript文档:https://lbsyun.baidu.com/index.php?title=jspopularGL 1、申请百度API密钥 控制台->应用管理->我的应…

博途字符串和FIFO编程应用(SCL源代码)

FIFO的其它介绍请参看下面文章链接: PLC堆栈(FIFO)操作之栈级联_三菱plc控制系统的堆栈的工作原理_RXXW_Dor的博客-CSDN博客这篇博文主要讲下各种缓存栈的级联,提供一个分析问题的扩展思路,这个级联什么时候适合在项目里使用需要具体分析。级联实现数据队列的一级级递推传送…

Redis 安装

目录 1、准备安装环境 2、上传安装文件 3、解压安装文件 4、进入安装目录 5、运行编译命令 6、前台启动 ​编辑7、后台启动 8、验证服务 9、关闭服务 10、开启启动 1、准备安装环境 由于 Redis 是基于 C 语言编写的,因此首先需要安装 Redis 所需要的依赖…

Python(一):为什么我们要学习Python?

❤️ 专栏简介:本专栏记录了我个人从零开始学习Python编程的过程。在这个专栏中,我将分享我在学习Python的过程中的学习笔记、学习路线以及各个知识点。 ☀️ 专栏适用人群 :本专栏适用于希望学习Python编程的初学者和有一定编程基础的人。无…

星辰天合受邀参加 2023 全球数字经济大会

7 月 4 日至 7 日,以“数据驱动发展,智能引领未来”为主题的2023全球数字经济大会在北京隆重举办。作为国内技术领先的数据基础设施提供商,星辰天合以北京优秀信创企业代表的身份,受北京信息化协会邀请,参加了 2023 全…

ICC2:copy block方法

open_lib new.nlib open_lib old_lib copy_block -from_block old_block -to_block new.nlib:old_block save_lib new.nlib close_lib 如果是从同一个lib下的block copy到同个lib里,那就open_lib后直接copy就好了,操作时用current_block new_name_b…

多元回归预测 | Matlab基于高斯过程回归(GPR)的数据回归预测,matlab代码,多变量输入模型

文章目录 效果一览文章概述部分源码参考资料效果一览 文章概述 多元回归预测 | Matlab基于高斯过程回归(GPR)的数据回归预测,matlab代码,多变量输入模型 评价指标包括:MAE、RMSE和R2等,代码质量极高,方便学习和替换数据。要求2018版本及以上。 部分源码

STM32 Proteus仿真LCD12864俄罗斯方块-FZ0063

STM32 Proteus仿真LCD12864俄罗斯方块-FZ0063 Proteus仿真小实验: STM32 Proteus仿真LCD12864俄罗斯方块-FZ0063 功能: 硬件组成:STM32F103R6单片机 LCD12864显示器多个按键 1.标准俄罗斯方块经典游戏玩法,带计时,…

计数排序 (Counting Sort)_20230709

计数排序(Counting Sort) 前言 计数排序的对象一般为分布在[0-k]范围内的非负整数,计数器类似哈希函数的线性映射,它确定了数值本身和它在序列中的总数量之间的基本关系。它的本质是计算某个数在临时序列中(原序列大小相同,但下…

零售业未来如何破局?抓住数智化经营的两把利刃!

导语 | 数字化转型浪潮席卷了千行百业,有人从中看出了汹涌的挑战,也有人从中嗅出了美妙的商机。对于零售企业而言,当前数智经营进入了哪个阶段?未来的破局之道又在何方?我们邀请到了广东省 CIO 协会消费品与零售行业分…

API接口知识小结(电商API接入)

应用程序接口API(Application Programming Interface),是提供特定业务输出能力、连接不同系统的一种约定。这里包括外部系统与提供服务的系统(中后台系统)或后台不同系统之间的交互点。包括外部接口、内部接口…

Redis专题学习(一)Redis核心数据结构实战与高性能原理剖析

redis是key-value的存储格式, key是string类型的, value可以有五种基本的数据结构:string、hash、list、set、zset 来看看 这5种基本数据类型的基本使用和应用 一.字符串string string是最常见和最基本的数据结构 基本使用: …