lammps编译(2Aug2023、intel2020、rtx4070ti)

news2025/1/23 11:22:49

说明:

[root@node101 ~]# cat /etc/redhat-release
CentOS Linux release 7.9.2009 (Core)
[root@node101 ~]# gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-44) (GCC)
[root@node101 ~]# which mpirun
/opt/gpuApp/ompi/bin/mpirun
[root@node101 ~]# which icc
/opt/intel/compilers_and_libraries_2020.1.211/linux/bin/intel64/icc
[root@node101 ~]# which nvcc
/usr/local/cuda-12.3/bin/nvcc
[root@node101 ~]# lscpu
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                32
On-line CPU(s) list:   0-31
Thread(s) per core:    2
Core(s) per socket:    16
Socket(s):             1
NUMA node(s):          1
Vendor ID:             AuthenticAMD
CPU family:            23
Model:                 49
Model name:            AMD EPYC 7302 16-Core Processor
Stepping:              0
CPU MHz:               1500.000
CPU max MHz:           3000.0000
CPU min MHz:           1500.0000
BogoMIPS:              6000.34
Virtualization:        AMD-V
L1d cache:             32K
L1i cache:             32K
L2 cache:              512K
L3 cache:              16384K
NUMA node0 CPU(s):     0-31
Flags:                 fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc art rep_good nopl nonstop_tsc extd_apicid aperfmperf eagerfpu pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_l2 cpb cat_l3 cdp_l3 hw_pstate sme retpoline_amd ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip overflow_recov succor smca
[root@node101 ~]# free -g
              total        used        free      shared  buff/cache   available
Mem:            251           5         227           0          18         244
Swap:           127           0         127
[root@node101 ~]#
 

lammps既支持单精度,也支持双精度。由于4070Ti的双精度性能很差,本次采用单精度方式编译和运行。

显卡的SM值可以通过cuda自带的工具查询:

[root@node101 tools]#ls /usr/local/cuda/samples/1_Utilities/deviceQuery

deviceQuery deviceQuery.cpp deviceQuery.o Makefile NsightEclipse.xml readme.txt

[root@node101 tools]#cd /usr/local/cuda/samples/1_Utilities/deviceQuery

[root@node101 deviceQuery]#./deviceQuery

1、环境文件

cat << EOF > ~/lammps-gpu-env.sh

#!/bin/bash

source /opt/intel/compilers_and_libraries_2020/linux/bin/compilervars.sh intel64

export PATH=/usr/local/cuda-12.3/bin:\$PATH

export LD_LIBRARY_PATH=/usr/local/cuda-12.3/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH

export C_INCLUDE_PATH=/usr/local/cuda-12.3/targets/x86_64-linux/include:\$C_INCLUDE_PATH

EOF

2、gdrcopy

cd ~/gpu-lammps/

tar -zxvf gdrcopy-2.0.tar.gz

cd gdrcopy-2.0/

mkdir -p /opt/gpuApp/gdrcopy/include

mkdir -p /opt/gpuApp/gdrcopy/lib64

make PREFIX=/opt/gpuApp/gdrcopy lib lib_install

cat << EOF >> ~/lammps-gpu-env.sh

export PATH=/opt/gpuApp/gdrcopy/include:\$PATH

export CPATH=/opt/gpuApp/gdrcopy/include:\$CPATH

export LD_LIBRARY_PATH=/opt/gpuApp/gdrcopy/lib64:\$LD_LIBRARY_PATH

EOF

3、ucx

cd ~/gpu-lammps/

tar -zxvf ucx-1.7.0.tar.gz

cd ucx-1.7.0/

./configure --prefix=/opt/gpuApp/ucx --enable-optimizations --disable-logging --disable-debug --disable-assertions --disable-params-check --disable-doxygen-doc --with-cuda=/usr/local/cuda --with-gdrcopy=/opt/gpuApp/gdrcopy/ --with-verbs --with-rdmacm

……………………………….

configure: =========================================================

configure: UCX build configuration:

configure: Preprocessor flags:   -DCPU_FLAGS="|avx" -I${abs_top_srcdir}/src -I${abs_top_builddir} -I${abs_top_builddir}/src

configure:            C flags:   -O3 -g -Wall -Werror -mavx

configure:          C++ flags:   -O3 -g -Wall -Werror -mavx

configure:       Multi-thread:   Disabled

configure:          MPI tests:   Disabled

configure:      Devel headers:

configure:        UCT modules:   < cuda ib rdmacm cma >

configure:       CUDA modules:   < gdrcopy >

configure:       ROCM modules:   < >

configure:         IB modules:   < >

configure:        UCM modules:   < cuda >

configure:       Perf modules:   < cuda >

configure: =========================================================

…………..

make -j 32

make install

cat << EOF >> ~/lammps-gpu-env.sh

export PATH=/opt/gpuApp/ucx/bin:\$PATH

export LD_LIBRARY_PATH=/opt/gpuApp/ucx/lib:\$LD_LIBRARY_PATH

EOF

4、openmpi

[root@node101 gpu-lammps]# cd ~/gpu-lammps/

[root@node101 gpu-lammps]# tar -xvf openmpi-4.1.6.tar

[root@node101 gpu-lammps]# cd openmpi-4.1.6/

[root@node101 openmpi-4.1.6]# ./configure --prefix=/opt/gpuApp/ompi --enable-mpirun-prefix-by-default --enable-cuda --enable-dlopen --enable-weak-symbols --enable-heterogeneous --enable-binaries --enable-script-wrapper-compilers --enable-orterun-prefix-by-default --enable-mca-no-build=btl-uct --with-cuda --with-pmix --with-verbs --with-ucx=/opt/gpuApp/ucx

…………

Open MPI configuration:

-----------------------

Version: 4.1.6

Build MPI C bindings: yes

Build MPI C++ bindings (deprecated): no

Build MPI Fortran bindings: mpif.h, use mpi

MPI Build Java bindings (experimental): no

Build Open SHMEM support: yes

Debug build: no

Platform file: (none)

Miscellaneous

-----------------------

CUDA support: yes

HWLOC support: internal

Libevent support: internal

Open UCC: no

PMIx support: Internal

Transports

-----------------------

Cisco usNIC: no

Cray uGNI (Gemini/Aries): no

Intel Omnipath (PSM2): no

Intel TrueScale (PSM): no

Mellanox MXM: no

Open UCX: yes

OpenFabrics OFI Libfabric: no

OpenFabrics Verbs: yes

Portals4: no

Shared memory/copy in+copy out: yes

Shared memory/Linux CMA: yes

Shared memory/Linux KNEM: no

Shared memory/XPMEM: no

TCP: yes

Resource Managers

-----------------------

Cray Alps: no

Grid Engine: no

LSF: no

Moab: no

Slurm: yes

ssh/rsh: yes

Torque: no

OMPIO File Systems

-----------------------

DDN Infinite Memory Engine: no

Generic Unix FS: yes

IBM Spectrum Scale/GPFS: no

Lustre: no

PVFS2/OrangeFS: no

[root@node101 openmpi-4.1.6]# make -j 32

[root@node101 openmpi-4.1.6]# make install

[root@node101 openmpi-4.1.6]# cat << EOF >> ~/lammps-gpu-env.sh

export PATH=/opt/gpuApp/ompi/bin:\$PATH

export LD_LIBRARY_PATH=/opt/gpuApp/ompi/lib:\$LD_LIBRARY_PATH

export INCLUDE=/opt/gpuApp/ompi/include:\$INCLUDE

EOF

[root@node101 openmpi-4.1.6]#

5、lammps-cpu

[root@node101 gpu-lammps]# tar -zxvf lammps-2Aug2023.tar.gz

[root@node101 gpu-lammps]# cd lammps-2Aug2023/src

[root@node101 src]#source ~/lammps-gpu-env.sh

[root@node101 src]# make yes-all

[root@node101 src]# make no-lib

[root@node101 src]# cp MAKE/OPTIONS/Makefile.intel_cpu_openmpi MAKE/Makefile.intel

[root@node101 src]# make -j 32 intel

[root@node101 src]# cp lmp_intel lmp_intel_cpu

6、lammps-gpu

[root@node101 gpu-lammps]# cd lammps-2Aug2023/lib/gpu/

[root@node101 gpu]#source ~/lammps-gpu-env.sh

[root@node101 gpu]# vi Makefile.linux                         ##修改SM和CUDA_PRECISION[强撞1] 

[root@node101 gpu]# make -f Makefile.linux               ##编译GPU库

[root@node101 gpu]# ./nvc_get_devices

Found 1 platform(s).

CUDA Driver Version:                           12.30

Device 0: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

Device 1: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

Device 2: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

Device 3: "NVIDIA GeForce RTX 4070 Ti"

  Type of device:                                GPU

  Compute capability:                            8.9

  Double precision support:                      Yes

  Total amount of global memory:                 11.7281 GB

  Number of compute units/multiprocessors:       60

  Number of cores:                               11520

  Total amount of constant memory:               65536 bytes

  Total amount of local/shared memory per block: 49152 bytes

  Total number of registers available per block: 65536

  Warp size:                                     32

  Maximum number of threads per block:           1024

  Maximum group size (# of threads per block)    1024 x 1024 x 64

  Maximum item sizes (# threads for each dim)    2147483647 x 65535 x 65535

  Maximum memory pitch:                          2147483647 bytes

  Texture alignment:                             512 bytes

  Clock rate:                                    2.61 GHz

  Run time limit on kernels:                     No

  Integrated:                                    No

  Support host page-locked memory mapping:       Yes

  Compute mode:                                  Default

  Concurrent kernel execution:                   Yes

  Device has ECC support enabled:                No

[root@node101 gpu]# cd ../../src

[root@node101 src]#make package-status

[root@node101 src]#make yes-gpu

[root@node101 src]#make no-amoeba

[root@node101 src]#make clean-all

[root@node101 src]#make clean-machine

[root@node101 src]#make clean-intel

[root@node101 src]#make -j 32 intel

[root@node101 src]#cp lmp_intel lmp_intel_gpu

7、测试(以下命令假设已将~/lammps-gpu-env.sh拷贝到/opt/gpuApp/,并将编译得到的lmp可执行文件拷贝到/opt/gpuApp/lammps/)

7.1cpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 12 /opt/gpuApp/lammps/lmp_intel_cpu -in in.NHO

7.2 4core_1gpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 4 /opt/gpuApp/lammps/lmp_intel_gpu -sf gpu -pk gpu 1 -in in.NHO

GPU状态:

7.3 16core_1gpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 16 /opt/gpuApp/lammps/lmp_intel_gpu -sf gpu -pk gpu 1 -in in.NHO

GPU状态:

7.4 16core_4gpu

source /opt/gpuApp/lammps-gpu-env.sh

mpirun -np 16 /opt/gpuApp/lammps/lmp_intel_gpu -sf gpu -pk gpu 4 -in in.NHO

GPU状态:


 [强撞1]4070Ti为Ada Lovelace架构(计算能力8.9,见上文nvc_get_devices输出),SM应设为sm_89,而不是安培架构的sm_86。双精度性能差,PRECISION为-D_SINGLE_SINGLE

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/1307330.html

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!

相关文章

双十二哪个牌子的电视盒子好用?测评员总结电视盒子品牌排行榜

每次电商大促我会分享好物推荐,本期我要盘点的数码产品是电视盒子,电视盒子的功能和重要性大家都懂,但电视盒子如何选就不太了解了,我根据今年20多次的测评结果整理了电视盒子品牌排行榜,看看哪个牌子的电视盒子好用吧…

beebox靶场A3 low级别 xss通关教程(二)

六:xss get型 eval 通过观察我们可以发现url地址中存在一个date函数 那我们可以试一下把后面的date()函数去掉,直接写入一个alert(555) 发现直接弹出一个框,证明有xss漏洞 七:xss href 直接进入页面会看到是get方法,…

HarmonyOS4.0从零开始的开发教程12给您的应用添加弹窗

HarmonyOS(十)给您的应用添加弹窗 概述 在我们日常使用应用的时候,可能会进行一些敏感的操作,比如删除联系人,这时候我们给应用添加弹窗来提示用户是否需要执行该操作,如下图所示: 弹窗是一种…

【51单片机系列】直流电机使用

本文是关于直流电机使用的相关介绍。 文章目录 一、直流电机介绍二、ULN2003芯片介绍三、在proteus中仿真实现对电机的驱动 51单片机的应用中,电机控制方面的应用也很多。在学习直流电机(PWM)之前,先使用GPIO控制电机的正反转和停止。但不能直接使用GPIO…

飞天使-linux操作的一些技巧与知识点4-ansible常用的技巧,配置等

文章目录 ansible配置文件的优先级尝试开始进行操作ansible常用模块ansible 的playbook示例安装phpplaybook中变量的引用 ansible yum install -y ansible 测试是否可用 ansible localhost -m ping /etc/ansible/ansible.cfg :主配置文件,配置 ansible…

3个好用的桌面管理软件!点赞

大家是不是觉得自己的桌面有时候特别乱、上班的时候会影响自己的心情。尤其是一些大企业,干净整洁的桌面是领导必须的要求。 今天就为大家推荐3款好用的桌面管理软件,这其中有的是适合企业用的,有的是适合个人用的,大家可根据自己…

KaiwuDB × 国网山东综能 | 分布式储能云边端一体化项目建设

项目背景 济南韩家峪村首个高光伏渗透率台区示范项目因其所处地理位置拥有丰富的光照资源,该区域住户 80% 以上的屋顶都安装了光伏板。仅 2022 年全年,光伏发电总量达到了百万千瓦时。 大量分布式光伏并网,在输出清洁电力的同时,…

全志XR806开发板RTOS环境搭建及问题

测评一 RTOS环境搭建及问题 按照官网文档https://xr806.docs.aw-ol.com/rtos/env/说明,RTOS先拉取SDK,由于XR806是Cortex-M33 Star内核然后指定gcc-arm-none-eabi工具链的位置,再编译再烧录,开发过程和ESP32有些许相似,…

深入理解Dubbo-7.服务消费调用源码分析

👏作者简介:大家好,我是爱吃芝士的土豆倪,24届校招生Java选手,很高兴认识大家📕系列专栏:Spring源码、JUC源码、Kafka原理、分布式技术原理🔥如果感觉博主的文章还不错的话…

媒介盒子:软文推广让你的品牌宣传更高效

软文推广在当今企业的宣传方式中具有至关重要的作用。随着互联网技术的不断发展和社交媒体的广泛使用,软文推广已经成为品牌和广告主的首选。如何在海量信息中脱颖而出,如何让内容在众多信息中获得更高的点击率与转化率,还需要借助软文推广的…

算法通关村第十八关-黄金挑战回溯困难问题

大家好我是苏麟 , 今天带来几道回溯比较困难的题 . 回溯有很多比较难的问题,这里我们看两个,整体来说这两个只是处理略复杂,还不是最难的问题 . 大纲 IP问题 IP问题 描述 : 有效 IP 地址 正好由四个整数(每个整数位于 0 到 255 …

UE5 - ArchvizExplorer与Map Border Collection结合 - 实现电子围栏效果

插件地址: https://www.unrealengine.com/marketplace/zh-CN/product/archviz-explorer https://www.unrealengine.com/marketplace/zh-CN/product/map-border-collection ArchvizExplorer扩展: https://download.csdn.net/download/qq_17523181/8843305…

独热编码和词向量的简单理解

把单词用向量表示,是把深度神经网络语言模型引入自然语言处理领域的一个核心技术。想要让机器理解单词,就必须要把它变成一串数字(向量)。下面介绍的 One-Hot Encoding(One-Hot 编码)和 Word Embedding …

【LittleXi】2023 ICPC ECfinal 出线 官方数据 民间预测

【LittleXi】2023 ICPC ECfinal 出线 官方数据 民间预测 说明: 参考去年、前年上海大学,设置210出线队伍 对6场区域赛(不含港澳)走Z字,每个学校最多三支队伍出线 字符串问题,可能会有几个名额失真,比如南…

【unity】【WebRTC】从0开始创建一个Unity远程媒体流app-设置输入设备

【项目源码】 包括本篇需要的脚本都打包在项目源码中,可以通过下面链接下载: 【背景】 目前我们能投射到远端浏览器(或者任何其它Peer)的媒体流只有默认的MainCamera画面,其实我们还可以通过配置输入来传输操作输入信息,比如键鼠等。 【追加input processing组件】 …

PyCharm控制台堆栈乱码问题解决

目录 1、问题描述2、问题原因3、问题解决 1、问题描述 PyCharm环境都已经配置成了UTF-8编码,控制台打印中文也不会出现乱码,但报错堆栈信息中如果有中文会出现中文乱码: 这种该怎么解决呢? 2、问题原因 未将PyCharm编码环境与项目…

Redis - 主从集群下的主从复制原理

主从复制过程 数据同步演变过程 sync 同步 Redis 2.8 版本之前,首次通信成功后, slave 会向 master 发送 sync 数据同步请求。然后 master 就会将其所有数据全部发送给 slave ,由 slave 保存到其本地的持久化文件中。这个过 程…

10个前端开发不容错过的工具网站

作为开发人员,我们经常寻找合适的工具和资源来帮助日常开发工作。但是很多好用的工具网站尤其是国外的网站很多人都错过了。 这里我整理了一份包含 10 个网站的列表,这些网站或许可以帮助到作为前端开发者的你。 1、MDN Web 文档 MDN文档无疑是 Web 开…

Echarts Y轴自定义设置图片

如图Y轴有文字和图片,1-3的图片不同,后面的是特定的css 样式;实现代码 yAxis: {type: category,inverse: true,boundaryGap: false,axisTick: { show: false }, // 是否展示标记点axisLine: { show: false },axisLabel: {// 坐标轴的标签// f…

C++ Qt开发:CheckBox多选框组件

Qt 是一个跨平台C++图形界面开发库,利用Qt可以快速开发跨平台窗体应用程序,在Qt中我们可以通过拖拽的方式将不同组件放到指定的位置,实现图形化开发极大的方便了开发效率,本章将重点介绍CheckBox单行输入框组件的常用方法及灵活运用…