madlib2.0简介
Apache MADlib
madlib使用指南
MADlib: Main Page
madlib安装
Installation Guide - Apache MADlib - Apache Software Foundation
准备
- cmake 3.5.2及以上版本
- python3.9
- postgresql15,源码编译时必须指定 --with-python
- postgresql插件plpython3u
源码安装cmake3.16.2
cd /opt
wget https://github.com/Kitware/CMake/releases/download/v3.16.2/cmake-3.16.2.tar.gz
tar -zxvf cmake-3.16.2.tar.gz
cd cmake-3.16.2
./configure --prefix=/usr/local/cmake
make -j 4
make install
echo 'export CMAKE_HOME=/usr/local/cmake
export PATH=$CMAKE_HOME/bin:$PATH' >> /etc/profile
source /etc/profile
postgresql15.2源码安装
Centos7.6安装postgresql15
su - postgres
psql etl -U postgres
--指定库建立插件plpython3u
create extension plpython3u;
--创建用户madlib作为超级管理员
create role madlib with superuser login password '000000';
--默认公开由madlib创建的表的查询权限
alter default privileges for role madlib grant select on tables to public;
\q
exit
madlib2.0源码安装
cd /opt
# 上传 apache-madlib-2.0.0-src.tar.gz
# 下载地址:https://dist.apache.org/repos/dist/release/madlib/2.0.0/apache-madlib-2.0.0-src.tar.gz
rz # apache-madlib-2.0.0-src.tar.gz
# 解压
tar -zxvf /opt/apache-madlib-2.0.0-src.tar.gz -C /usr/local/src
# 环境变量
# MADLIB_ROOT : 源码目录
# MADLIB_BUILD_ROOT : 构建目录
echo 'export MADLIB_ROOT=/usr/local/src/apache-madlib-2.0.0-src
export MADLIB_BUILD_ROOT=/usr/local/src/apache-madlib-2.0.0-src/build' >> /etc/profile
source /etc/profile
# 创建build目录
mkdir $MADLIB_ROOT/build
# 切到目录 $MADLIB_BUILD_ROOT
cd $MADLIB_BUILD_ROOT
# 配置
cmake ..
# 编译
# 会在线下载一些文件,所以比较慢,注意,不要用并行
make
# 会发现编译报错
/usr/local/src/apache-madlib-2.0.0-src/methods/array_ops/src/pg_gp/array_ops.c:11:24: fatal error: utils/int8.h: No such file or directory
#include "utils/int8.h"
^
compilation terminated.
make[2]: *** [src/ports/postgres/15/CMakeFiles/madlib_postgresql_15.dir/__/__/__/__/methods/array_ops/src/pg_gp/array_ops.c.o] Error 1
make[1]: *** [src/ports/postgres/15/CMakeFiles/madlib_postgresql_15.dir/all] Error 2
make: *** [all] Error 2
# 修改源码,将第11行的 #include "utils/int8.h" 删除
sed -i 's@#include "utils/int8.h"@@g' /usr/local/src/apache-madlib-2.0.0-src/methods/array_ops/src/pg_gp/array_ops.c
# 然后重新编译
make
# 又报另外一个错
# 函数调用错误,实际pg15中的pg_md5_hash有4个参数,但是源码中调用只用了3个参数
/usr/local/src/apache-madlib-2.0.0-src/methods/sketch/src/pg_gp/sketch_support.c: In function ‘sketch_md5_bytea’:
/usr/local/src/apache-madlib-2.0.0-src/methods/sketch/src/pg_gp/sketch_support.c:322:9: error: too few arguments to function ‘pg_md5_hash’
pg_md5_hash(datp, len, outbuf);
^
In file included from /usr/local/src/apache-madlib-2.0.0-src/methods/sketch/src/pg_gp/sketch_support.c:46:0:
/usr/local/pgsql/include/server/common/md5.h:29:13: note: declared here
extern bool pg_md5_hash(const void *buff, size_t len, char *hexsum,
^
make[2]: *** [src/ports/postgres/15/CMakeFiles/madlib_postgresql_15.dir/__/__/__/__/methods/sketch/src/pg_gp/sketch_support.c.o] Error 1
make[1]: *** [src/ports/postgres/15/CMakeFiles/madlib_postgresql_15.dir/all] Error 2
make: *** [all] Error 2
# 定位到321行的else,如下图操作
vi /usr/local/src/apache-madlib-2.0.0-src/methods/sketch/src/pg_gp/sketch_support.c
# 增加
const char *errstr = NULL;
# 修改
pg_md5_hash(datp, len, outbuf, &errstr);
# 再次编译,此时已经编译成功
make
# 将源码目录授权给postgres用户
chown -R postgres.postgres $MADLIB_ROOT
# 切换到postgres用户
su - postgres
# 开始安装
$MADLIB_ROOT/src/bin/madpack -s madlib -c madlib/000000@localhost:5432/etl install --platform postgres
# 报错
m4:/usr/local/src/apache-madlib-2.0.0-src/src/ports/postgres/modules/validation/cross_validation.sql_in:14: cannot open `SQLCommon.m4': No such file or directory
madpack.py: INFO : Installing MADlib:
madpack.py: ERROR : Failed executing /tmp/madlib.Niatpe/madlib_install.sql
madpack.py: ERROR : Check the log at /tmp/madlib.Niatpe/madlib_install.sql.log
madpack.py: INFO : MADlib install unsuccessful.
madpack.py: INFO : All changes are rolled back.
INFO: Log files saved in /tmp/madlib.Niatpe
# 查找文件 SQLCommon.m4
find $MADLIB_ROOT -name SQLCommon.m4
/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/madpack/SQLCommon.m4
# 切到该目录
cd /usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/madpack/
# 重新安装
$MADLIB_ROOT/src/bin/madpack -s madlib -c madlib/000000@localhost:5432/etl install --platform postgres
# 依然报错
madpack.py: INFO : Detected PostgreSQL version 15.2.
madpack.py: INFO : *** Installing MADlib ***
madpack.py: INFO : MADlib tools version = 2.0.0 (/usr/local/src/apache-madlib-2.0.0-src/src/bin/../madpack/madpack.py)
madpack.py: INFO : MADlib database version = None (host=localhost:5432, db=etl, schema=madlib)
madpack.py: INFO : Testing PL/Python environment...
madpack.py: INFO : > PL/Python environment OK (version: 3.9.17)
madpack.py: INFO : > Preparing objects for the following modules:
madpack.py: INFO : > - bayes
madpack.py: INFO : > - crf
madpack.py: INFO : > - elastic_net
madpack.py: INFO : > - linalg
madpack.py: INFO : > - pmml
madpack.py: INFO : > - prob
madpack.py: INFO : > - svm
madpack.py: INFO : > - tsa
madpack.py: INFO : > - conjugate_gradient
madpack.py: INFO : > - knn
madpack.py: INFO : > - lda
madpack.py: INFO : > - stats
madpack.py: INFO : > - utilities
madpack.py: INFO : > - assoc_rules
madpack.py: INFO : > - convex
madpack.py: INFO : > - dbscan
madpack.py: INFO : > - deep_learning
madpack.py: INFO : > - glm
madpack.py: INFO : > - graph
madpack.py: INFO : > - linear_systems
madpack.py: INFO : > - mxgboost
madpack.py: INFO : > - recursive_partitioning
madpack.py: INFO : > - regress
madpack.py: INFO : > - sample
madpack.py: INFO : > - summary
madpack.py: INFO : > - kmeans
madpack.py: INFO : > - pca
madpack.py: INFO : > - validation
madpack.py: INFO : Installing MADlib:
madpack.py: ERROR : Failed executing /tmp/madlib.QJOwWI/madlib_install.sql
madpack.py: ERROR : Check the log at /tmp/madlib.QJOwWI/madlib_install.sql.log
madpack.py: INFO : MADlib install unsuccessful.
madpack.py: INFO : All changes are rolled back.
INFO: Log files saved in /tmp/madlib.QJOwWI
# 查看日志
cat /tmp/madlib.QJOwWI/madlib_install.sql.log | tail -4
CREATE OR REPLACE FUNCTION madlib.vcrf_top1_label(mArray DOUBLE PRECISION[], rArray DOUBLE PRECISION[], nlabel int)
returns int[] as 'libmadlib.so' language c strict;
psql:/tmp/madlib.QJOwWI/madlib_install.sql:464: ERROR: could not access file "libmadlib.so": No such file or directory
# 查找该文件
find $MADLIB_ROOT -name libmadlib.so
/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/lib/libmadlib.so
# 编辑pg的数据目录下的postgresql.auto.conf文件
vi $PGDATA/postgresql.auto.conf
dynamic_library_path = '/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/lib'
# 重启postgresql
pg_ctl restart -mf
# 再次重新安装
$MADLIB_ROOT/src/bin/madpack -s madlib -c madlib/000000@localhost:5432/etl install --platform postgres
# 依然报错
madpack.py: INFO : Detected PostgreSQL version 15.2.
server signaled
madpack.py: INFO : *** Installing MADlib ***
madpack.py: INFO : MADlib tools version = 2.0.0 (/usr/local/src/apache-madlib-2.0.0-src/src/bin/../madpack/madpack.py)
madpack.py: INFO : MADlib database version = None (host=localhost:5432, db=etl, schema=madlib)
madpack.py: INFO : Testing PL/Python environment...
madpack.py: INFO : > PL/Python environment OK (version: 3.9.17)
madpack.py: INFO : > Preparing objects for the following modules:
madpack.py: INFO : > - bayes
madpack.py: INFO : > - crf
madpack.py: INFO : > - elastic_net
madpack.py: INFO : > - linalg
madpack.py: INFO : > - pmml
madpack.py: INFO : > - prob
madpack.py: INFO : > - svm
madpack.py: INFO : > - tsa
madpack.py: INFO : > - conjugate_gradient
madpack.py: INFO : > - knn
madpack.py: INFO : > - lda
madpack.py: INFO : > - stats
madpack.py: INFO : > - utilities
madpack.py: INFO : > - assoc_rules
madpack.py: INFO : > - convex
madpack.py: INFO : > - dbscan
madpack.py: INFO : > - deep_learning
madpack.py: INFO : > - glm
madpack.py: INFO : > - graph
madpack.py: INFO : > - linear_systems
madpack.py: INFO : > - mxgboost
madpack.py: INFO : > - recursive_partitioning
madpack.py: INFO : > - regress
madpack.py: INFO : > - sample
madpack.py: INFO : > - summary
madpack.py: INFO : > - kmeans
madpack.py: INFO : > - pca
madpack.py: INFO : > - validation
madpack.py: INFO : Installing MADlib:
madpack.py: ERROR : Failed executing /tmp/madlib.pSbRNZ/madlib_install.sql
madpack.py: ERROR : Check the log at /tmp/madlib.pSbRNZ/madlib_install.sql.log
madpack.py: INFO : MADlib install unsuccessful.
madpack.py: INFO : All changes are rolled back.
INFO: Log files saved in /tmp/madlib.pSbRNZ
# 查看日志 报错:函数不存在
cat /tmp/madlib.pSbRNZ/madlib_install.sql.log | tail -5
madlib.array_union(anyarray) (
SFUNC = array_cat,
STYPE = anyarray
);
psql:/tmp/madlib.pSbRNZ/madlib_install.sql:571: ERROR: function array_cat(anyarray, anyarray) does not exist
# 自己建函数 array_cat(anyarray, anyarray)
psql etl -U madlib
-- Compatibility shim: MADlib's install script declares aggregates with
-- "SFUNC = array_cat" over an anyarray state type, and the install log shows
-- "function array_cat(anyarray, anyarray) does not exist" on this server.
-- This wrapper provides that signature and forwards to the built-in.
--
-- The inner call is schema-qualified on purpose: an unqualified array_cat()
-- is resolved through search_path and may bind to one of the public.array_cat
-- wrappers instead of the built-in, which would recurse.
-- SECURITY DEFINER is intentionally not used: concatenating two arrays needs
-- no elevated privileges, and a SECURITY DEFINER function without a pinned
-- search_path is a privilege-escalation hazard.
-- IMMUTABLE: the result depends only on the two arguments.
CREATE OR REPLACE FUNCTION public.array_cat(anyarray, anyarray)
RETURNS anyarray
LANGUAGE plpgsql
IMMUTABLE
AS $function$
begin
    return pg_catalog.array_cat($1, $2);
end;
$function$
;
\q
# 再次安装
$MADLIB_ROOT/src/bin/madpack -s madlib -c madlib/000000@localhost:5432/etl install --platform postgres
# 依然报错
madpack.py: INFO : Detected PostgreSQL version 15.2.
madpack.py: INFO : *** Installing MADlib ***
madpack.py: INFO : MADlib tools version = 2.0.0 (/usr/local/src/apache-madlib-2.0.0-src/src/bin/../madpack/madpack.py)
madpack.py: INFO : MADlib database version = None (host=localhost:5432, db=etl, schema=madlib)
madpack.py: INFO : Testing PL/Python environment...
madpack.py: INFO : > PL/Python environment OK (version: 3.9.17)
madpack.py: INFO : > Preparing objects for the following modules:
madpack.py: INFO : > - bayes
madpack.py: INFO : > - crf
madpack.py: INFO : > - elastic_net
madpack.py: INFO : > - linalg
madpack.py: INFO : > - pmml
madpack.py: INFO : > - prob
madpack.py: INFO : > - svm
madpack.py: INFO : > - tsa
madpack.py: INFO : > - conjugate_gradient
madpack.py: INFO : > - knn
madpack.py: INFO : > - lda
madpack.py: INFO : > - stats
madpack.py: INFO : > - utilities
madpack.py: INFO : > - assoc_rules
madpack.py: INFO : > - convex
madpack.py: INFO : > - dbscan
madpack.py: INFO : > - deep_learning
madpack.py: INFO : > - glm
madpack.py: INFO : > - graph
madpack.py: INFO : > - linear_systems
madpack.py: INFO : > - mxgboost
madpack.py: INFO : > - recursive_partitioning
madpack.py: INFO : > - regress
madpack.py: INFO : > - sample
madpack.py: INFO : > - summary
madpack.py: INFO : > - kmeans
madpack.py: INFO : > - pca
madpack.py: INFO : > - validation
madpack.py: INFO : Installing MADlib:
madpack.py: ERROR : Failed executing /tmp/madlib.IB61Bf/madlib_install.sql
madpack.py: ERROR : Check the log at /tmp/madlib.IB61Bf/madlib_install.sql.log
madpack.py: INFO : MADlib install unsuccessful.
madpack.py: INFO : All changes are rolled back.
INFO: Log files saved in /tmp/madlib.IB61Bf
# 查看日志 报错:函数不存在
cat /tmp/madlib.IB61Bf/madlib_install.sql.log | tail -7
CREATE AGGREGATE madlib.agg_array_concat(REAL[]) (
SFUNC = array_cat,
PREFUNC = array_cat,
STYPE = REAL[]
);
psql:/tmp/madlib.IB61Bf/madlib_install.sql:15416: WARNING: aggregate attribute "prefunc" not recognized
psql:/tmp/madlib.IB61Bf/madlib_install.sql:15416: ERROR: function array_cat(real[], real[]) does not exist
# 自己建函数 array_cat(real[], real[])
psql etl -U madlib
-- Compatibility shim for MADlib's "CREATE AGGREGATE ... (SFUNC = array_cat,
-- STYPE = REAL[])", which fails on this server with
-- "function array_cat(real[], real[]) does not exist".
--
-- The inner call MUST be schema-qualified: this wrapper is itself an exact
-- match for array_cat(real[], real[]), so an unqualified call made while
-- "public" is on the search_path would resolve back to this function and
-- recurse until the stack depth limit is hit.
-- SECURITY DEFINER is intentionally not used: no elevated privileges are
-- needed, and without a pinned search_path it would be unsafe.
-- IMMUTABLE: the result depends only on the two arguments.
CREATE OR REPLACE FUNCTION public.array_cat(real[], real[])
RETURNS real[]
LANGUAGE plpgsql
IMMUTABLE
AS $function$
begin
    return pg_catalog.array_cat($1, $2);
end;
$function$
;
\q
# 再次安装
$MADLIB_ROOT/src/bin/madpack -s madlib -c madlib/000000@localhost:5432/etl install --platform postgres
# 依然报错
madpack.py: INFO : Detected PostgreSQL version 15.2.
madpack.py: INFO : *** Installing MADlib ***
madpack.py: INFO : MADlib tools version = 2.0.0 (/usr/local/src/apache-madlib-2.0.0-src/src/bin/../madpack/madpack.py)
madpack.py: INFO : MADlib database version = None (host=localhost:5432, db=etl, schema=madlib)
madpack.py: INFO : Testing PL/Python environment...
madpack.py: INFO : > PL/Python environment OK (version: 3.9.17)
madpack.py: INFO : > Preparing objects for the following modules:
madpack.py: INFO : > - bayes
madpack.py: INFO : > - crf
madpack.py: INFO : > - elastic_net
madpack.py: INFO : > - linalg
madpack.py: INFO : > - pmml
madpack.py: INFO : > - prob
madpack.py: INFO : > - svm
madpack.py: INFO : > - tsa
madpack.py: INFO : > - conjugate_gradient
madpack.py: INFO : > - knn
madpack.py: INFO : > - lda
madpack.py: INFO : > - stats
madpack.py: INFO : > - utilities
madpack.py: INFO : > - assoc_rules
madpack.py: INFO : > - convex
madpack.py: INFO : > - dbscan
madpack.py: INFO : > - deep_learning
madpack.py: INFO : > - glm
madpack.py: INFO : > - graph
madpack.py: INFO : > - linear_systems
madpack.py: INFO : > - mxgboost
madpack.py: INFO : > - recursive_partitioning
madpack.py: INFO : > - regress
madpack.py: INFO : > - sample
madpack.py: INFO : > - summary
madpack.py: INFO : > - kmeans
madpack.py: INFO : > - pca
madpack.py: INFO : > - validation
madpack.py: INFO : Installing MADlib:
madpack.py: ERROR : Failed executing /tmp/madlib.rjqeeK/madlib_install.sql
madpack.py: ERROR : Check the log at /tmp/madlib.rjqeeK/madlib_install.sql.log
madpack.py: INFO : MADlib install unsuccessful.
madpack.py: INFO : All changes are rolled back.
INFO: Log files saved in /tmp/madlib.rjqeeK
# 查看日志 报错:函数不存在
cat /tmp/madlib.rjqeeK/madlib_install.sql.log | tail -7
CREATE AGGREGATE madlib.agg_array_concat(SMALLINT[]) (
SFUNC = array_cat,
PREFUNC = array_cat,
STYPE = SMALLINT[]
);
psql:/tmp/madlib.rjqeeK/madlib_install.sql:15423: WARNING: aggregate attribute "prefunc" not recognized
psql:/tmp/madlib.rjqeeK/madlib_install.sql:15423: ERROR: function array_cat(smallint[], smallint[]) does not exist
# 自己建函数 array_cat(smallint[], smallint[])
psql etl -U madlib
-- Compatibility shim for MADlib's "CREATE AGGREGATE ... (SFUNC = array_cat,
-- STYPE = SMALLINT[])", which fails on this server with
-- "function array_cat(smallint[], smallint[]) does not exist".
--
-- The inner call MUST be schema-qualified: this wrapper is itself an exact
-- match for array_cat(smallint[], smallint[]), so an unqualified call made
-- while "public" is on the search_path would resolve back to this function
-- and recurse until the stack depth limit is hit.
-- SECURITY DEFINER is intentionally not used: no elevated privileges are
-- needed, and without a pinned search_path it would be unsafe.
-- IMMUTABLE: the result depends only on the two arguments.
CREATE OR REPLACE FUNCTION public.array_cat(smallint[], smallint[])
RETURNS smallint[]
LANGUAGE plpgsql
IMMUTABLE
AS $function$
begin
    return pg_catalog.array_cat($1, $2);
end;
$function$
;
\q
# 再次安装
$MADLIB_ROOT/src/bin/madpack -s madlib -c madlib/000000@localhost:5432/etl install --platform postgres
# 安装成功
madpack.py: INFO : Detected PostgreSQL version 15.2.
madpack.py: INFO : *** Installing MADlib ***
madpack.py: INFO : MADlib tools version = 2.0.0 (/usr/local/src/apache-madlib-2.0.0-src/src/bin/../madpack/madpack.py)
madpack.py: INFO : MADlib database version = None (host=localhost:5432, db=etl, schema=madlib)
madpack.py: INFO : Testing PL/Python environment...
madpack.py: INFO : > PL/Python environment OK (version: 3.9.17)
madpack.py: INFO : > Preparing objects for the following modules:
madpack.py: INFO : > - bayes
madpack.py: INFO : > - crf
madpack.py: INFO : > - elastic_net
madpack.py: INFO : > - linalg
madpack.py: INFO : > - pmml
madpack.py: INFO : > - prob
madpack.py: INFO : > - svm
madpack.py: INFO : > - tsa
madpack.py: INFO : > - conjugate_gradient
madpack.py: INFO : > - knn
madpack.py: INFO : > - lda
madpack.py: INFO : > - stats
madpack.py: INFO : > - utilities
madpack.py: INFO : > - assoc_rules
madpack.py: INFO : > - convex
madpack.py: INFO : > - dbscan
madpack.py: INFO : > - deep_learning
madpack.py: INFO : > - glm
madpack.py: INFO : > - graph
madpack.py: INFO : > - linear_systems
madpack.py: INFO : > - mxgboost
madpack.py: INFO : > - recursive_partitioning
madpack.py: INFO : > - regress
madpack.py: INFO : > - sample
madpack.py: INFO : > - summary
madpack.py: INFO : > - kmeans
madpack.py: INFO : > - pca
madpack.py: INFO : > - validation
madpack.py: INFO : Installing MADlib:
madpack.py: INFO : > Created madlib schema
madpack.py: INFO : > Created madlib.MigrationHistory table
madpack.py: INFO : > Wrote version info in MigrationHistory table
madpack.py: INFO : MADlib 2.0.0 installed successfully in madlib schema.
# 如果依然报错函数错误,也可能是没有函数array_union(pg_catalog.anyarray)
psql etl -U madlib
-- Fallback aggregate over any array type. Its state has the same array type
-- as the input, and each input array is folded into the state by calling
-- public.array_cat(state, input).
-- NOTE(review): this is created in schema "public", while the failing
-- statement in the install log targets madlib.array_union; confirm this
-- object is actually the one the installer needs.
CREATE OR REPLACE AGGREGATE public.array_union(pg_catalog.anyarray) (
SFUNC = public.array_cat,
STYPE = anyarray
);
\q
# 权限
psql etl -U postgres
--回收madlib的超级用户权限
alter role madlib with nosuperuser;
--授权madlib用户madlib模式的所有权限
grant all on schema madlib to madlib;
--授权public用户组madlib模式的使用权限
grant usage on schema madlib to public;
--回收public对madlib用户今后创建的函数的默认执行权限,也就是这些函数得显式授权才能使用
alter default privileges for role madlib revoke execute on functions from public;
--意图:回收public对madlib模式下新建函数的默认执行权限,使其必须显式授权才能使用;注意:按模式(in schema)的revoke无法回收全局默认授予public的execute权限,所以这条语句实际不会生效,真正起作用的是上面那条不带in schema的语句
alter default privileges in schema madlib revoke execute on functions from public;
--默认公开在模式madlib下表的查询权限
alter default privileges in schema madlib grant select on tables to public;
测试
-- Fixture for the stratified-sampling smoke test.
-- id1/id2 are the payload columns returned by the sample; gr1/gr2 define the
-- strata. The 25 rows fall into four strata of different sizes:
-- (gr1, gr2) = (1,1): 12 rows, (1,2): 6 rows, (2,2): 7 rows.
DROP TABLE IF EXISTS test;
CREATE TABLE test(
    id1 INTEGER,
    id2 INTEGER,
    gr1 INTEGER,
    gr2 INTEGER
);
-- Explicit column list so the load does not depend on physical column order.
INSERT INTO test (id1, id2, gr1, gr2) VALUES
    (1,0,1,1),
    (2,0,1,1),
    (3,0,1,1),
    (4,0,1,1),
    (5,0,1,1),
    (6,0,1,1),
    (7,0,1,1),
    (8,0,1,1),
    (9,0,1,1),
    (9,0,1,1),
    (9,0,1,1),
    (9,0,1,1),
    (0,1,1,2),
    (0,2,1,2),
    (0,3,1,2),
    (0,4,1,2),
    (0,5,1,2),
    (0,6,1,2),
    (10,10,2,2),
    (20,20,2,2),
    (30,30,2,2),
    (40,40,2,2),
    (50,50,2,2),
    (60,60,2,2),
    (70,70,2,2);
-- Smoke test: draw a stratified sample from "test" into "out", then list the
-- sampled rows in a deterministic order.
DROP TABLE IF EXISTS out;
SELECT madlib.stratified_sample(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
false -- Sample without replacement
);
SELECT * FROM out ORDER BY gr1,gr2,id1,id2;
- 报错:ERROR: ImportError: cannot import name ‘stratified_sample’ from ‘sample’ (unknown location)
- 查看源码
# 发现/usr/local/src/apache-madlib-2.0.0-src/src/lib/python目录都不存在
cd /usr/local/src/apache-madlib-2.0.0-src/src/lib/python
-bash: cd: /usr/local/src/apache-madlib-2.0.0-src/src/lib/python: No such file or directory
# 发现不是py文件
ls /usr/local/src/apache-madlib-2.0.0-src/src/modules/sample
random_process.cpp random_process.hpp sample.hpp weighted_sample.cpp weighted_sample.hpp WeightedSample_impl.hpp WeightedSample_proto.hpp
find / -name python
find $MADLIB_ROOT -name sample
/usr/local/src/apache-madlib-2.0.0-src/build/third_party/src/EP_boost/libs/geometry/doc/src/docutils/tools/doxygen_xml2qbk/sample
/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/greenplum/modules/sample
/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/CMakeFiles/madlib_postgresql_15.dir/__/__/__/modules/sample
/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/modules/sample
/usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/modules/sample
/usr/local/src/apache-madlib-2.0.0-src/src/ports/postgres/modules/sample
/usr/local/src/apache-madlib-2.0.0-src/src/modules/sample
su - postgres
mkdir -p /usr/local/src/apache-madlib-2.0.0-src/src/lib
# 创建软连接
ln -s /export/anaconda3/lib/python3.9 /usr/local/src/apache-madlib-2.0.0-src/src/lib/python
# 重命名
mv /usr/local/src/apache-madlib-2.0.0-src/src/modules /usr/local/src/apache-madlib-2.0.0-src/src/modules_bak
# 创建软连接
ln -s /usr/local/src/apache-madlib-2.0.0-src/build/src/ports/postgres/15/modules /usr/local/src/apache-madlib-2.0.0-src/src/modules
- 重新测试
-- Re-run of the smoke test after the module path fix: draw a stratified
-- sample from "test" into "out", then list the sampled rows in a
-- deterministic order.
DROP TABLE IF EXISTS out;
SELECT madlib.stratified_sample(
'test', -- Source table
'out', -- Output table
0.5, -- Sample proportion
'gr1,gr2', -- Strata definition
'id1,id2', -- Columns to output
false -- Sample without replacement
);
SELECT * FROM out ORDER BY gr1,gr2,id1,id2;