Environment:
[root@node101 DataX]# uname -a
Linux node101 4.4.219-1.el7.elrepo.x86_64 #1 SMP Sun Apr 12 16:13:06 EDT 2020 x86_64 x86_64 x86_64 GNU/Linux
[root@node101 DataX]# cat /etc/redhat-release
CentOS Linux release 7.6.1810 (Core)
[root@node101 DataX]# python
Python 2.7.5 (default, Oct 30 2018, 23:45:53)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-36)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
[root@node101 DataX]# rpm -qa | grep java
java-1.8.0-openjdk-devel-1.8.0.322.b06-1.el7_9.x86_64
python-javapackages-3.4.1-11.el7.noarch
tzdata-java-2021e-1.el7.noarch
javapackages-tools-3.4.1-11.el7.noarch
java-1.8.0-openjdk-headless-1.8.0.322.b06-1.el7_9.x86_64
java-1.8.0-openjdk-1.8.0.322.b06-1.el7_9.x86_64
Install the Java and Maven packages:
yum install java -y
wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
yum -y install apache-maven
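Optionally verify both toolchains are on the PATH before building (a quick sanity check):
# Print the installed Java and Maven versions.
java -version
mvn -version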
Download the source code:
DataX_source_code_home=/root/
cd /root/
git clone https://github.com/alibaba/DataX.git
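The full git history is not needed just to build; if the clone is slow, a shallow clone is an optional alternative:
# Optional: fetch only the latest revision to save time and bandwidth.
git clone --depth 1 https://github.com/alibaba/DataX.git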
Build with Maven:
First switch the Maven mirror to Aliyun's public repository:
[root@node101 maven]# pwd
/etc/maven
[root@node101 maven]# cat settings.xml
Add the following inside the <mirrors> ... </mirrors> element:
<mirror>
    <id>aliyunmaven</id>
    <mirrorOf>*</mirrorOf>
    <name>阿里云公共仓库</name>
    <url>https://maven.aliyun.com/repository/public</url>
</mirror>
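To confirm the mirror is actually picked up, Maven can print its effective settings; a quick check, where the grep pattern assumes the <id> used above:
# The mirror URL should appear in the effective <mirrors> section.
mvn help:effective-settings | grep -A 3 aliyunmaven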
$ cd {DataX_source_code_home}
Comment out the modules that fail to compile (the numbers are their lines in pom.xml):
cat pom.xml
60 <!-- <module>hdfsreader</module> -->
68 <!-- <module>tsdbreader</module> -->
81 <!-- <module>hdfswriter</module> -->
98 <!-- <module>tsdbwriter</module> -->
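Editing by hand works; as a scripted alternative, here is a sketch that comments out the four modules with sed, assuming each <module> entry sits on its own line in pom.xml:
# Wrap each offending <module> entry in an XML comment.
cd /root/DataX
for m in hdfsreader tsdbreader hdfswriter tsdbwriter; do
    sed -i "s#<module>${m}</module>#<!-- <module>${m}</module> -->#" pom.xml
done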
$ mvn -U clean package assembly:assembly -Dmaven.test.skip=true -X
The build succeeds; the tail of the log looks like this:
[INFO] datax-all .......................................... SUCCESS [04:43 min]
[INFO] datax-common ....................................... SUCCESS [ 7.217 s]
[INFO] datax-transformer .................................. SUCCESS [ 5.076 s]
[INFO] datax-core ......................................... SUCCESS [ 9.603 s]
[INFO] plugin-rdbms-util .................................. SUCCESS [ 3.718 s]
[INFO] mysqlreader ........................................ SUCCESS [ 2.609 s]
[INFO] drdsreader ......................................... SUCCESS [ 3.476 s]
[INFO] sqlserverreader .................................... SUCCESS [ 3.511 s]
[INFO] postgresqlreader ................................... SUCCESS [ 2.976 s]
[INFO] kingbaseesreader ................................... SUCCESS [ 2.635 s]
[INFO] oraclereader ....................................... SUCCESS [ 3.067 s]
[INFO] odpsreader ......................................... SUCCESS [ 4.741 s]
[INFO] otsreader .......................................... SUCCESS [ 4.627 s]
[INFO] otsstreamreader .................................... SUCCESS [ 4.596 s]
[INFO] plugin-unstructured-storage-util ................... SUCCESS [ 3.321 s]
[INFO] txtfilereader ...................................... SUCCESS [ 9.043 s]
[INFO] streamreader ....................................... SUCCESS [ 2.204 s]
[INFO] ossreader .......................................... SUCCESS [ 9.114 s]
[INFO] ftpreader .......................................... SUCCESS [ 8.767 s]
[INFO] mongodbreader ...................................... SUCCESS [ 7.671 s]
[INFO] rdbmsreader ........................................ SUCCESS [ 2.733 s]
[INFO] hbase11xreader ..................................... SUCCESS [ 13.827 s]
[INFO] hbase094xreader .................................... SUCCESS [ 8.680 s]
[INFO] opentsdbreader ..................................... SUCCESS [ 6.222 s]
[INFO] cassandrareader .................................... SUCCESS [ 4.187 s]
[INFO] gdbreader .......................................... SUCCESS [ 9.736 s]
[INFO] oceanbasev10reader ................................. SUCCESS [ 3.777 s]
[INFO] mysqlwriter ........................................ SUCCESS [ 2.152 s]
[INFO] tdenginewriter ..................................... SUCCESS [ 4.274 s]
[INFO] drdswriter ......................................... SUCCESS [ 2.347 s]
[INFO] odpswriter ......................................... SUCCESS [ 4.852 s]
[INFO] txtfilewriter ...................................... SUCCESS [ 7.167 s]
[INFO] ftpwriter .......................................... SUCCESS [ 8.369 s]
[INFO] streamwriter ....................................... SUCCESS [ 1.996 s]
[INFO] otswriter .......................................... SUCCESS [ 4.322 s]
[INFO] oraclewriter ....................................... SUCCESS [ 2.136 s]
[INFO] sqlserverwriter .................................... SUCCESS [ 2.002 s]
[INFO] postgresqlwriter ................................... SUCCESS [ 2.027 s]
[INFO] kingbaseeswriter ................................... SUCCESS [ 2.017 s]
[INFO] osswriter .......................................... SUCCESS [ 6.950 s]
[INFO] mongodbwriter ...................................... SUCCESS [ 6.970 s]
[INFO] adswriter .......................................... SUCCESS [ 6.641 s]
[INFO] ocswriter .......................................... SUCCESS [ 4.350 s]
[INFO] rdbmswriter ........................................ SUCCESS [ 2.312 s]
[INFO] hbase11xwriter ..................................... SUCCESS [ 13.488 s]
[INFO] hbase094xwriter .................................... SUCCESS [ 6.986 s]
[INFO] hbase11xsqlwriter .................................. SUCCESS [ 19.160 s]
[INFO] hbase11xsqlreader .................................. SUCCESS [ 19.933 s]
[INFO] elasticsearchwriter ................................ SUCCESS [ 3.980 s]
[INFO] adbpgwriter ........................................ SUCCESS [ 4.821 s]
[INFO] gdbwriter .......................................... SUCCESS [ 8.611 s]
[INFO] cassandrawriter .................................... SUCCESS [ 4.164 s]
[INFO] clickhousewriter ................................... SUCCESS [ 4.105 s]
[INFO] oscarwriter ........................................ SUCCESS [ 2.346 s]
[INFO] oceanbasev10writer ................................. SUCCESS [ 4.574 s]
[INFO] hbase20xsqlreader .................................. SUCCESS [ 3.002 s]
[INFO] hbase20xsqlwriter .................................. SUCCESS [ 2.440 s]
[INFO] kuduwriter ......................................... SUCCESS [ 2.997 s]
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 10:03 min
[INFO] Finished at: 2022-03-12T10:44:05+08:00
[INFO] Final Memory: 487M/1083M
[INFO] ------------------------------------------------------------------------
After a successful build, the packaged DataX is located at {DataX_source_code_home}/target/datax/datax/ , with the following structure:
$ cd {DataX_source_code_home}
[root@node101 datax]# pwd
/root/DataX/target/datax/datax
[root@node101 datax]# ls
bin conf job lib plugin script tmp
Copy the datax directory to /usr/local/:
[root@node101 datax]# pwd
/root/DataX/target/datax
[root@node101 datax]# ls
datax
[root@node101 datax]# cp -r datax /usr/local/
Create a command-line symlink:
[root@node101 datax]# chmod 755 /usr/local/datax -R
[root@node101 datax]# ln -sf /usr/local/datax/bin/datax.py /usr/bin/datax.py
[root@node101 datax]# ll /usr/bin/datax.py
lrwxrwxrwx 1 root root 29 3月 12 11:12 /usr/bin/datax.py -> /usr/local/datax/bin/datax.py
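Since datax.py has to be invoked through Python, an optional convenience wrapper lets jobs be launched as `datax job.json`. A sketch: the path /usr/local/bin/datax is my choice, and it assumes the system `python` is the Python 2 that datax.py expects:
# Write a small launcher script and make it executable.
cat > /usr/local/bin/datax <<'EOF'
#!/bin/bash
exec python /usr/local/datax/bin/datax.py "$@"
EOF
chmod +x /usr/local/bin/datax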
Test:
Read records from a stream reader and print them to the console.
First, view the official template:
[root@node01 datax]# python bin/datax.py -r streamreader -w streamwriter
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.
Please refer to the streamreader document:
https://github.com/alibaba/DataX/blob/master/streamreader/doc/streamreader.md
Please refer to the streamwriter document:
https://github.com/alibaba/DataX/blob/master/streamwriter/doc/streamwriter.md
Please save the following configuration as a json file and use
python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json
to run the job.
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "streamreader",
                    "parameter": {
                        "column": [],
                        "sliceRecordCount": ""
                    }
                },
                "writer": {
                    "name": "streamwriter",
                    "parameter": {
                        "encoding": "",
                        "print": true
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": ""
            }
        }
    }
}
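Rather than copying the template by hand, it can be saved straight to a job file; a sketch, assuming the JSON begins at the first line starting with '{' so the banner lines above it are dropped:
# Keep everything from the first '{' onward and write it to the job file.
python bin/datax.py -r streamreader -w streamwriter 2>/dev/null | sed -n '/^{/,$p' > job/stream2_stream.json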
Edit the configuration based on the template:
[root@node01 datax]# vim job/stream2_stream.json
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "streamreader",
                    "parameter": {
                        "sliceRecordCount": 10,
                        "column": [
                            {
                                "type": "long",
                                "value": "10"
                            },
                            {
                                "type": "string",
                                "value": "hello,DataX"
                            }
                        ]
                    }
                },
                "writer": {
                    "name": "streamwriter",
                    "parameter": {
                        "encoding": "UTF-8",
                        "print": true
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": 1
            }
        }
    }
}
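Before running, a quick syntax check catches typos in the edited file; json.tool ships with Python 2.7 and exits non-zero on a parse error:
# Validate the job file; prints "JSON OK" only if it parses cleanly.
python -m json.tool job/stream2_stream.json > /dev/null && echo "JSON OK"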
Run the job (partial log shown):
[root@node101 datax]# pwd
/usr/local/datax
[root@node101 datax]# ls
bin conf job lib log log_perf plugin script tmp
[root@node101 datax]# ./bin/datax.py ./job/stream2_stream.json
DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved
2022-03-12 11:15:31.677 [main] INFO VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
2022-03-12 11:15:31.693 [main] INFO Engine - the machine info =>
osInfo: Red Hat, Inc. 1.8 25.322-b06
jvmInfo: Linux amd64 4.4.219-1.el7.elrepo.x86_64
cpu num: 4
totalPhysicalMemory: -0.00G
freePhysicalMemory: -0.00G
maxFileDescriptorCount: -1
currentOpenFileDescriptorCount: -1
GC Names [PS MarkSweep, PS Scavenge]
MEMORY_NAME            | allocation_size | init_size
PS Eden Space          | 256.00MB        | 256.00MB
Code Cache             | 240.00MB        | 2.44MB
Compressed Class Space | 1,024.00MB      | 0.00MB
PS Survivor Space      | 42.50MB         | 42.50MB
PS Old Gen             | 683.00MB        | 683.00MB
Metaspace              | -0.00MB         | 0.00MB
2022-03-12 11:15:31.727 [main] INFO Engine -
{
    "content":[
        {
            "reader":{
                "name":"streamreader",
                "parameter":{
                    "column":[
                        {
                            "type":"long",
                            "value":"10"
                        },
                        {
                            "type":"string",
                            "value":"hello,DataX"
                        }
                    ],
                    "sliceRecordCount":10
                }
            },
            "writer":{
                "name":"streamwriter",
                "parameter":{
                    "encoding":"UTF-8",
                    "print":true
                }
            }
        }
    ],
    "setting":{
        "speed":{
            "channel":1
        }
    }
}
2022-03-12 11:15:31.761 [main] WARN Engine - prioriy set to 0, because NumberFormatException, the value is: null
2022-03-12 11:15:31.765 [main] INFO PerfTrace - PerfTrace traceId=job_-1, isEnable=false, priority=0
2022-03-12 11:15:31.765 [main] INFO JobContainer - DataX jobContainer starts job.
2022-03-12 11:15:31.771 [main] INFO JobContainer - Set jobId = 0
2022-03-12 11:15:31.805 [job-0] INFO JobContainer - jobContainer starts to do prepare ...
2022-03-12 11:15:31.807 [job-0] INFO JobContainer - DataX Reader.Job [streamreader] do prepare work .
2022-03-12 11:15:31.809 [job-0] INFO JobContainer - DataX Writer.Job [streamwriter] do prepare work .
2022-03-12 11:15:31.813 [job-0] INFO JobContainer - jobContainer starts to do split ...
2022-03-12 11:15:31.814 [job-0] INFO JobContainer - Job set Channel-Number to 1 channels.
2022-03-12 11:15:31.816 [job-0] INFO JobContainer - DataX Reader.Job [streamreader] splits to [1] tasks.
2022-03-12 11:15:31.818 [job-0] INFO JobContainer - DataX Writer.Job [streamwriter] splits to [1] tasks.
2022-03-12 11:15:31.858 [job-0] INFO JobContainer - jobContainer starts to do schedule ...
2022-03-12 11:15:31.868 [job-0] INFO JobContainer - Scheduler starts [1] taskGroups.
2022-03-12 11:15:31.875 [job-0] INFO JobContainer - Running by standalone Mode.
2022-03-12 11:15:31.891 [taskGroup-0] INFO TaskGroupContainer - taskGroupId=[0] start [1] channels for [1] tasks.
2022-03-12 11:15:31.900 [taskGroup-0] INFO Channel - Channel set byte_speed_limit to -1, No bps activated.
2022-03-12 11:15:31.900 [taskGroup-0] INFO Channel - Channel set record_speed_limit to -1, No tps activated.
2022-03-12 11:15:31.927 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
10 hello,DataX
2022-03-12 11:15:32.029 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] taskId[0] is successed, used[105]ms
2022-03-12 11:15:32.030 [taskGroup-0] INFO TaskGroupContainer - taskGroup[0] completed it's tasks.
2022-03-12 11:15:41.908 [job-0] INFO StandAloneJobContainerCommunicator - Total 10 records, 130 bytes | Speed 13B/s, 1 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.000s | All Task WaitReaderTime 0.000s | Percentage 100.00%
2022-03-12 11:15:41.909 [job-0] INFO AbstractScheduler - Scheduler accomplished all tasks.
2022-03-12 11:15:41.910 [job-0] INFO JobContainer - DataX Writer.Job [streamwriter] do post work.
2022-03-12 11:15:41.911 [job-0] INFO JobContainer - DataX Reader.Job [streamreader] do post work.
2022-03-12 11:15:41.935 [job-0] INFO JobContainer - DataX jobId [0] completed successfully.
2022-03-12 11:15:41.937 [job-0] INFO HookInvoker - No hook invoked, because base dir not exists or is a file: /usr/local/datax/hook
2022-03-12 11:15:41.942 [job-0] INFO JobContainer -
[total cpu info] =>
averageCpu | maxDeltaCpu | minDeltaCpu
-1.00%     | -1.00%      | -1.00%
[total gc info] =>
NAME         | totalGCCount | maxDeltaGCCount | minDeltaGCCount | totalGCTime | maxDeltaGCTime | minDeltaGCTime
PS MarkSweep | 0            | 0               | 0               | 0.000s      | 0.000s         | 0.000s
PS Scavenge  | 0            | 0               | 0               | 0.000s      | 0.000s         | 0.000s
2022-03-12 11:15:41.943 [job-0] INFO JobContainer - PerfTrace not enable!
2022-03-12 11:15:41.945 [job-0] INFO StandAloneJobContainerCommunicator - Total 10 records, 130 bytes | Speed 13B/s, 1 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.000s | All Task WaitReaderTime 0.000s | Percentage 100.00%
2022-03-12 11:15:41.949 [job-0] INFO JobContainer -
Job start time            : 2022-03-12 11:15:31
Job end time              : 2022-03-12 11:15:41
Total elapsed time        : 10s
Average throughput        : 13B/s
Record write speed        : 1rec/s
Total records read        : 10
Total read/write failures : 0
If you see output like the above, DataX is installed and configured successfully.
Error:
在有总bps限速条件下,单个channel的bps值不能为空,也不能为非正数
(When a total bps limit is set, the per-channel bps value must not be empty or non-positive.)
Fix (via https://blog.csdn.net/chrisy521/article/details/122033748):
Set core -> transport -> channel -> speed -> "byte" to 2000000, raising the per-channel limit to roughly 2MB/s.
[root@node101 conf]# pwd
/usr/local/datax/conf
[root@node101 conf]# vim core.json
24 "transport": {
25 "channel": {
26 "class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel",
27 "speed": {
28 "byte": -1,
29 "record": -1,
30 "byte": 2000000 #新增行