From 626b6fe84bef7de9b299b61727d6a3c4a44ae9c7 Mon Sep 17 00:00:00 2001
From: sunsingerus
Date: Tue, 21 Nov 2017 13:59:40 +0300
Subject: [PATCH 1/6] docs

---
 README.md | 105 ++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 78 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 8b57571..f05303b 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,11 @@

* [Introduction](#introduction)
* [Requirements](#requirements)
+ * [Operation](#operation)
+ * [Requirements and Limitations](#requirements-and-limitations)
+ * [General Schema](#general-schema)
+ * [Example](#example)
+ * [Performance](#performance)
* [Testing](#testing)
* [MySQL Data Types](#mysql-data-types)
* [ClickHouse Data Types](#clickhouse-data-types)

@@ -18,23 +23,25 @@

# Introduction

-Utility to read mysql data
+Utility to import data into ClickHouse from MySQL (mainly) and/or CSV files

# Requirements

-This package is used for interacting with MySQL:
+Data reader requires **Python 3.x** with additional modules installed.
+
+The `mysql-replication` package is used for communication with MySQL:
[https://github.com/noplay/python-mysql-replication](https://github.com/noplay/python-mysql-replication)
```bash
pip install mysql-replication
```

-This package is used for interacting with ClickHouse:
+The `clickhouse-driver` package is used for communication with ClickHouse:
[https://github.com/mymarilyn/clickhouse-driver](https://github.com/mymarilyn/clickhouse-driver)
```bash
pip install clickhouse-driver
```

-You need (at least one of) the `SUPER`, `REPLICATION CLIENT` privilege(s) for this operation
+At least one of the following MySQL privileges is also required for this operation: `SUPER`, `REPLICATION CLIENT`

```sql
CREATE USER 'reader'@'localhost' IDENTIFIED BY 'qwerty';
GRANT REPLICATION CLIENT, REPLICATION SLAVE, SUPER ON *.* TO 'reader'@'*';
FLUSH PRIVILEGES;
```

-MySQL config options required:
+The following MySQL config options are also required:
```ini
[mysqld]
server-id        = 1
log_bin          = /var/lib/mysql/bin.log
binlog-do-db     = airline
max_binlog_size  = 100M
binlog-format    = row # Very important if you want to receive write, update and delete row events
```

# Operation

## Requirements and Limitations

Data reader understands INSERT SQL statements only. In practice this means that:
 * You need to create the required table in ClickHouse before starting the data read procedure. More on how to create the target ClickHouse table: [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping)
 * UPDATE statements are not handled - meaning UPDATEs within MySQL will not be relayed into ClickHouse
 * DELETE statements are not handled - meaning DELETEs within MySQL will not be relayed into ClickHouse
 * DDL statements are not handled. For example, a source table structure change can lead to insertion errors

## General schema

 * Step 1. Data Reader reads data from the source event-by-event (for MySQL binlog) or line-by-line (file).
 * Step 2. **OPTIONAL** Caching in memory pool. Since ClickHouse prefers to get data in bundles (row-by-row insertion is extremely slow), we need to introduce some caching. The cache is flushed when any of the following is reached (a minimal sketch of this logic follows the schema):
   * number of rows in cache
   * number of events in cache
   * time elapsed
   * data source depleted
 * Step 3. **OPTIONAL** Writing CSV file. Sometimes it is useful to have the data also represented as a file
 * Step 4. Writing data into ClickHouse. Depending on the configuration of the previous steps, data is written into ClickHouse in either of the following ways:
   * directly, event-by-event or line-by-line
   * from the memory cache, as a bulk insert operation
   * from a CSV file, via `clickhouse-client`
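The flush decision from Step 2 can be pictured with a minimal sketch. This is illustrative only, not the tool's actual classes; the threshold names merely mirror the `--mempool-*` options shown in the example below, and the row threshold is an assumption:

```python
import time

class MemPoolSketch:
    """Illustrative memory pool: flush when any threshold is reached."""

    def __init__(self, max_events=1000, max_rows=100000, max_flush_interval=60):
        self.max_events = max_events                  # cf. --mempool-max-events-num
        self.max_rows = max_rows                      # assumed row threshold
        self.max_flush_interval = max_flush_interval  # cf. --mempool-max-flush-interval
        self.events, self.rows = 0, 0
        self.last_flush = time.time()

    def need_flush(self, source_depleted=False):
        # any single condition is enough to trigger a flush
        return (
            self.events >= self.max_events
            or self.rows >= self.max_rows
            or time.time() - self.last_flush >= self.max_flush_interval
            or source_depleted
        )
```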
## Example

Let's walk through a test example of the tool's launch command-line options:

```bash
$PYTHON main.py ${*:1} \
    --src-resume \
    --src-wait \
    --nice-pause=1 \
    --log-level=info \
    --log-file=ontime.log \
    --src-host=127.0.0.1 \
    --src-user=root \
    --dst-host=127.0.0.1 \
    --csvpool \
    --csvpool-file-path-prefix=qwe_ \
    --mempool-max-flush-interval=60 \
    --mempool-max-events-num=1000
```
Options description
 * `--src-resume` - resume data loading from the previous point. On the very first start, resume from the end of the log
 * `--src-wait` - wait for new data to arrive
 * `--nice-pause=1` - sleep for 1 second when no data is available
 * `--log-level=info` - log verbosity
 * `--log-file=ontime.log` - log file name
 * `--src-host=127.0.0.1` - MySQL source host
 * `--src-user=root` - MySQL source user (remember the PRIVILEGES required for this user)
 * `--dst-host=127.0.0.1` - ClickHouse host
 * `--csvpool` - keep a pool of CSV files (implies `--mempool` as well)
 * `--csvpool-file-path-prefix=qwe_` - write the pool's CSV files with the `qwe_` prefix into the current working directory
 * `--mempool-max-flush-interval=60` - flush the mempool at least every 60 seconds
 * `--mempool-max-events-num=1000` - flush the mempool after at most 1000 events (events, not rows)
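With `--csvpool`, flushed pool files reach ClickHouse through `clickhouse-client` (Step 4 above). As a rough illustration of that bulk-load step - the pool file name here is hypothetical, and the tool normally does this for you:

```bash
# load one pooled CSV file into ClickHouse as a single bulk insert (illustrative)
clickhouse-client --host=127.0.0.1 \
    --query="INSERT INTO airline.ontime FORMAT CSV" < qwe_pool_000001.csv
```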
+ +```bash +[user@localhost ~]$ ls -l pypy3.5-5.9-beta-linux_x86_64-portable +total 32 +drwxr-xr-x 2 user user 140 Oct 24 01:14 bin +drwxr-xr-x 5 user user 4096 Oct 3 11:57 include +drwxr-xr-x 4 user user 4096 Oct 3 11:57 lib +drwxr-xr-x 13 user user 4096 Oct 3 11:56 lib_pypy +drwxr-xr-x 3 user user 15 Oct 3 11:56 lib-python +-rw-r--r-- 1 user user 11742 Oct 3 11:56 LICENSE +-rw-r--r-- 1 user user 1296 Oct 3 11:56 README.rst +drwxr-xr-x 14 user user 4096 Oct 24 01:16 site-packages +drwxr-xr-x 2 user user 195 Oct 3 11:57 virtualenv_support +``` + +Install `pip` +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy -m ensurepip +``` +Install required modules +```bash +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install mysql-replication +pypy3.5-5.9-beta-linux_x86_64-portable/bin/pip3 install clickhouse-driver +``` + +Now you can run data reader via `pypy` +```bash +/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy main.py +``` # Testing From b80abc880bd2cec3b739dd0323f6e980df2612f1 Mon Sep 17 00:00:00 2001 From: sunsingerus Date: Tue, 21 Nov 2017 14:02:32 +0300 Subject: [PATCH 3/6] docs: URL fix --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1cccb66..ad429d1 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ Options description # Performance `pypy` significantly improves performance. You should try it. -For example you can start with [Portable PyPy distribution for Linux](#https://github.com/squeaky-pl/portable-pypy#portable-pypy-distribution-for-linux) +For example you can start with [Portable PyPy distribution for Linux](https://github.com/squeaky-pl/portable-pypy#portable-pypy-distribution-for-linux) Unpack it into your place of choice. ```bash From 0037d45aae0de7e4877d3e65cd32dcb86c879ac6 Mon Sep 17 00:00:00 2001 From: sunsingerus Date: Tue, 21 Nov 2017 14:32:01 +0300 Subject: [PATCH 4/6] airline.ontime test scripts --- ...me.sh => run_airline_ontime_data_reader.sh | 4 +++ run_airline_ontime_import.sh | 29 +++++++++++++++++++ 2 files changed, 33 insertions(+) rename run_ontime.sh => run_airline_ontime_data_reader.sh (81%) create mode 100755 run_airline_ontime_import.sh diff --git a/run_ontime.sh b/run_airline_ontime_data_reader.sh similarity index 81% rename from run_ontime.sh rename to run_airline_ontime_data_reader.sh index 837ed97..4157d0b 100755 --- a/run_ontime.sh +++ b/run_airline_ontime_data_reader.sh @@ -1,7 +1,11 @@ #!/bin/bash +# read airline.ontime test dataset from MySQL and write it to CH +# ugly stub to suppress unsufficient sockets sudo bash -c "echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse" +# run data reader with specified Python version + PYTHON=python3.6 PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy diff --git a/run_airline_ontime_import.sh b/run_airline_ontime_import.sh new file mode 100755 index 0000000..b73fb41 --- /dev/null +++ b/run_airline_ontime_import.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# import airline.ontime test dataset into MySQL + +# looking for csv files in this dir +FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" + +# limit import to this number of files +FILES_TO_IMPORT_NUM=3 + +i=1 +for file in $(ls "$FILES_TO_IMPORT_DIR"/*.csv|sort|head -n $FILES_TO_IMPORT_NUM); do + echo "$i. Prepare. Make link to $file" + rm -f ontime + ln -s $file ontime + + echo "$i. Import. $file" + time mysqlimport \ + --ignore-lines=1 \ + --fields-terminated-by=, \ + --fields-enclosed-by=\" \ + --local \ + -u root \ + airline ontime + + echo "$i. Cleanup. 
$file" + rm -f ontime + + i=$((i+1)) +done From 2268b95ff6a1a570b372893ae8899849cba5c511 Mon Sep 17 00:00:00 2001 From: sunsingerus Date: Tue, 21 Nov 2017 15:20:41 +0300 Subject: [PATCH 5/6] airline.ontime dataset --- README.md | 100 +++++++++++++++------------- run_airline_ontime_data_download.sh | 46 +++++++++++++ 2 files changed, 100 insertions(+), 46 deletions(-) create mode 100755 run_airline_ontime_data_download.sh diff --git a/README.md b/README.md index ad429d1..3c6316d 100644 --- a/README.md +++ b/README.md @@ -8,16 +8,18 @@ * [Requirements](#requirements) * [Operation](#operation) * [Requirements and Limitations](#requirements-and-limitations) - * [General Schema](#general-schema) + * [Operation General Schema](#operation-general-schema) * [Example](#example) * [Performance](#performance) * [Testing](#testing) - * [MySQL Data Types](#mysql-data-types) - * [ClickHouse Data Types](#clickhouse-data-types) - * [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) - * [MySQL Test Tables](#mysql-test-tables) - * [ClickHouse Test Tables](#clickhouse-test-tables) - * [Airline ontime Test Case](#airline-ontime-test-case) + * [Testing General Schema](#testing-general-schema) + * [MySQL Data Types](#mysql-data-types) + * [ClickHouse Data Types](#clickhouse-data-types) + * [MySQL -> ClickHouse Data Types Mapping](#mysql---clickhouse-data-types-mapping) + * [MySQL Test Tables](#mysql-test-tables) + * [ClickHouse Test Tables](#clickhouse-test-tables) + * [Test Cases](#test-cases) + * [airline.ontime Test Case](#airline-ontime-test-case) --- @@ -79,7 +81,7 @@ Data reader understands INSERT SQL statements only. In practice this means that: * DELETE statements are not handled - meaning DELETEs within MySQL would not be relayed into ClickHouse * DDL statements are not handled. For example, source table structure change can lead to insertion errors -## General schema +## Operation General Schema * Step 1. Data Reader reads data from the source event-by-event (for MySQL binlog) or line-by-line (file). * Step 2. **OPTIONAL** Caching in memory pool. Since ClickHouse prefers to get data in bundles (row-by-row insertion is extremely slow), we need to introduce some caching. @@ -164,9 +166,11 @@ Now you can run data reader via `pypy` # Testing -## MySQL Data Types +## Testing General Schema -### Numeric Types +### MySQL Data Types + +#### Numeric Types * `BIT` the number of bits per value, from 1 to 64 * `TINYINT` -128 to 127. The unsigned range is 0 to 255 @@ -182,7 +186,7 @@ Now you can run data reader via `pypy` * `DOUBLE`, `REAL` Permissible values are -1.7976931348623157E+308 to -2.2250738585072014E-308, 0, and 2.2250738585072014E-308 to 1.7976931348623157E+308 -### Date and Time Types +#### Date and Time Types * `DATE` The supported range is '1000-01-01' to '9999-12-31' * `DATETIME` The supported range is '1000-01-01 00:00:00.000000' to '9999-12-31 23:59:59.999999' @@ -190,7 +194,7 @@ Now you can run data reader via `pypy` * `TIME` The range is '-838:59:59.000000' to '838:59:59.000000' * `YEAR` Values display as 1901 to 2155, and 0000 -### String Types +#### String Types * `CHAR` The range of M is 0 to 255. If M is omitted, the length is 1. 
-## MySQL Test Tables
+### MySQL Test Tables

We have to split the test table into several tables because of this error produced by MySQL:
```text
@@ -594,7 +598,7 @@ INSERT INTO long_varbinary_datatypes SET
 ;
 ```

-## ClickHouse Test Tables
+### ClickHouse Test Tables

 ```sql
 CREATE TABLE datatypes(
@@ -705,9 +709,26 @@ CREATE TABLE long_varbinary_datatypes(
 ;
 ```

-## Airline ontime Test Case
+## Test Cases

### airline.ontime Test Case
Main Steps
 * Download the airline.ontime dataset
 * Create the airline.ontime MySQL table
 * Create the airline.ontime ClickHouse table
 * Start the data reader (migrate data MySQL -> ClickHouse)
 * Start the data importer (import data into MySQL)
 * Check how the data is loaded into ClickHouse

#### airline.ontime Data Set in CSV files
Run the [download script](run_airline_ontime_data_download.sh)
```bash
./run_airline_ontime_data_download.sh
```
Downloading can take some time.
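It is worth checking what actually arrived before moving on. Two quick checks, assuming the default `csv` directory used by the download script:

```bash
# how many CSV files were extracted, and how much disk they occupy
ls csv/*.csv | wc -l
du -sh csv/
```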
$file" - rm -f ontime - i=$((i+1)) -done +./run_airline_ontime_data_reader.sh ``` + +#### airline.ontime Data Importer +Run [data importer script](run_airline_ontime_import.sh) + +```bash +./run_airline_ontime_import.sh +``` diff --git a/run_airline_ontime_data_download.sh b/run_airline_ontime_data_download.sh new file mode 100755 index 0000000..85939c7 --- /dev/null +++ b/run_airline_ontime_data_download.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# download airline.ontime test dataset + +echo "Check required commands availability" +if command -v wget && command -v unzip && command -v clickhouse-client && command -v wc && command -v awk; then + echo "Looks like all required commands are available" +else + echo "Please ensure availability of: wget && unzip && clickhouse-client && wc && awk" + exit 1 +fi + +echo "Download dataset" + +ZIP_FILES_DIR="zip" +echo "Create dir $ZIP_FILES_DIR for downloading zip files" +mkdir "$ZIP_FILES_DIR" + +if [ ! -d "$ZIP_FILES_DIR" ]; then + "Can' use dir: $ZIP_FILES_DIR - not available" + exit 1 +fi + +echo "Download files into $ZIP_FILES_DIR" +for year in `seq 1987 2017`; do + for month in `seq 1 12`; do + FILE_NAME="On_Time_On_Time_Performance_${year}_${month}.zip" + wget -O "$ZIP_FILES_DIR/$FILE_NAME" "http://transtats.bts.gov/PREZIP/$FILE_NAME" + done +done + +echo "Unzip dataset" + +ZIP_FILES_DIR="zip" +CSV_FILES_DIR="csv" + +mkdir "$CSV_FILES_DIR" + +if [ ! -d "$CSV_FILES_DIR" ]; then + "Can' use dir: $CSV_FILES_DIR - not available" + exit 1 +fi + +for ZIP_FILENAME in `ls "$ZIP_FILES_DIR"/*.zip`; do + echo "Unzipping $ZIP_FILENAME to $CSV_FILES_DIR/" + unzip -o "$ZIP_FILENAME" -d "$CSV_FILES_DIR/" +done From d4ad93fe243284a242021722d29b3bfcb776025f Mon Sep 17 00:00:00 2001 From: sunsingerus Date: Tue, 21 Nov 2017 15:32:48 +0300 Subject: [PATCH 6/6] airline.ontime testcase --- README.md | 34 ++++++++++++++++++++++++++++- run_airline_ontime_data_download.sh | 7 +++--- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3c6316d..2078fea 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ * [MySQL Test Tables](#mysql-test-tables) * [ClickHouse Test Tables](#clickhouse-test-tables) * [Test Cases](#test-cases) - * [airline.ontime Test Case](#airline-ontime-test-case) + * [airline.ontime Test Case](#airlineontime-test-case) --- @@ -722,6 +722,13 @@ Main Steps #### airline.ontime Data Set in CSV files Run [download script](run_airline_ontime_data_download.sh) +You may want to adjust dirs where to keep `ZIP` and `CSV` file +In `run_airline_ontime_data_download.sh` edit these lines: +```bash +ZIP_FILES_DIR="zip" +CSV_FILES_DIR="csv" +``` + ```bash ./run_airline_ontime_data_download.sh ``` @@ -964,12 +971,37 @@ CREATE TABLE IF NOT EXISTS `airline`.`ontime` ( #### airline.ontime Data Reader Run [datareader script](run_airline_ontime_data_reader.sh) +You may want to adjust `PYTHON` path and source and target hosts and usernames +```bash +PYTHON=python3.6 +PYTHON=/home/user/pypy3.5-5.9-beta-linux_x86_64-portable/bin/pypy +``` +```bash +... + --src-host=127.0.0.1 \ + --src-user=root \ + --dst-host=127.0.0.1 \ +... 
+``` ```bash ./run_airline_ontime_data_reader.sh ``` #### airline.ontime Data Importer Run [data importer script](run_airline_ontime_import.sh) +You may want to adjust `CSV` files location, number of imported files and MySQL user/password used for import +```bash +# looking for csv files in this dir +FILES_TO_IMPORT_DIR="/mnt/nas/work/ontime" + +# limit import to this number of files +FILES_TO_IMPORT_NUM=3 +``` +```bash +... + -u root \ +... +``` ```bash ./run_airline_ontime_import.sh diff --git a/run_airline_ontime_data_download.sh b/run_airline_ontime_data_download.sh index 85939c7..f1bdcce 100755 --- a/run_airline_ontime_data_download.sh +++ b/run_airline_ontime_data_download.sh @@ -1,6 +1,9 @@ #!/bin/bash # download airline.ontime test dataset +ZIP_FILES_DIR="zip" +CSV_FILES_DIR="csv" + echo "Check required commands availability" if command -v wget && command -v unzip && command -v clickhouse-client && command -v wc && command -v awk; then echo "Looks like all required commands are available" @@ -11,7 +14,6 @@ fi echo "Download dataset" -ZIP_FILES_DIR="zip" echo "Create dir $ZIP_FILES_DIR for downloading zip files" mkdir "$ZIP_FILES_DIR" @@ -30,9 +32,6 @@ done echo "Unzip dataset" -ZIP_FILES_DIR="zip" -CSV_FILES_DIR="csv" - mkdir "$CSV_FILES_DIR" if [ ! -d "$CSV_FILES_DIR" ]; then