120 changes: 120 additions & 0 deletions create_parsable_tsv.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3


"""
usage: python3 create_parsable_tsv.py path/to/file
"""

import sys
import re


"""
Takes filename and returns entire contents

Note: filename must end with '.tsv' and first line must match (after cleaning)
id\tfirst_name\tlast_name\taccount_number\temail
"""
def get_file_contents(filename):
with open(filename, 'r', encoding='utf-16le') as tsv_file:
return tsv_file.readlines()


"""
Takes original data from file and turns it into a string that looks like:
1\tFistName\tLastName\t123456\[email protected]
"""
def clean_line(line_of_data):
# line should always look like: 1\tFistName\tLastName\t123456\[email protected]
fixed_data = line_of_data.replace('\x00','')
while '\t\t' in fixed_data:
fixed_data = fixed_data.replace("\t\t", "\t\'\\t\'\t")

return '\t'.join(fixed_data.split())
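# Illustrative example (mirrors the unit tests; raw_line_613 stands for the raw
# line read from the file): a record with empty name fields keeps the quoted
# placeholders after cleaning:
#   clean_line(raw_line_613)  ->  "613\t'\\t'\t'\\t'\t104969\[email protected]"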


"""
Regex list works for this file only - would change for other files

Specified for: id first_name last_name account_number email
"""
def get_hardcoded_table_data_regex():
return ['^[0-9]+', '[a-zA-Z]+', '[a-zA-Z]+', '^[0-9]', '\S+@\S+']


"""
Checks line for too many/not enough values and deletes/adds fields
"""
def fix_line_item_issues(item):
regexes = get_hardcoded_table_data_regex()

values = item.split()
for i in range(len(regexes)):
regex = regexes[i]
val = values[i]
if not re.search(regex, val):
if len(regexes) > len(values):
values.insert(i, '\t')
else:
values.remove(val)
else:
if i == 1 or i == 2:
if '\\t' not in val:
values[i] = val.title()
elif i == 4:
values[i] = val.lower()

return '\t'.join(values)
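# Illustrative example (taken from the unit test): the extra name part "Bosley"
# fails the account_number regex and is dropped:
#   fix_line_item_issues("29\tAdena\tHobbs\tBosley\t656184\[email protected]")
#   ->  "29\tAdena\tHobbs\t656184\[email protected]"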


"""
Loops through entire file contents and creates new, cleaned contents
"""
def create_parsable_file_contents(file_contents):
# special case for file header
header = clean_line(file_contents[0])
new_contents = [header +'\n']

last_str = ""
last_num = 1

for i in range(1, len(file_contents)):
# remove null characters in line
cleaned_line = clean_line(file_contents[i])

# start of new data entry found
if cleaned_line.startswith(str(last_num+1)):
new_contents.append(fix_line_item_issues(last_str) + '\n')

# special case - last line in file
if i == len(file_contents)-1:
new_contents.append(fix_line_item_issues(cleaned_line))
return new_contents

last_num += 1
last_str = cleaned_line
else:
# special case - don't include \t for header (line 1)
last_str += '\t' + cleaned_line if i != 1 else cleaned_line

return new_contents
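# Illustrative sketch (hypothetical raw record split across two file lines):
#   "5\tDana\tCole"  followed by  "330990\[email protected]"
# is accumulated into last_str as "5\tDana\tCole\t330990\[email protected]"
# before being passed to fix_line_item_issues().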


"""
Writes new, cleaned contents to file with name "path/to/file_parsable.tsv"
"""
def create_parsable_file(file_contents, filename):
parsable_filename = filename.replace('.tsv', '_parsable.tsv')
with open(parsable_filename, 'w', encoding='utf-8') as new_tsv_file:
new_tsv_file.writelines(file_contents)


def main():
    filename = sys.argv[1]  # 1st argument should be path/to/filename.tsv
    file_contents = get_file_contents(filename)
    parsable_file_contents = create_parsable_file_contents(file_contents)
    create_parsable_file(parsable_file_contents, filename)


if __name__ == '__main__':
    main()
19 changes: 19 additions & 0 deletions create_parsable_tsv_notes.md
@@ -0,0 +1,19 @@
### Script Usage Notes

Usage: `python3 create_parsable_tsv.py path/to/file`

Creates a new file in the same folder as the original data, named `filename_parsable.tsv`
(note: the filename must end with `.tsv`)

Anomalies found in the original data.tsv:
* Null characters had to be removed
* Unnecessary newline characters within lines of data had to be dealt with
  * In all cases, data was concatenated across lines (see the sketch below)
* Some names included more or fewer parts than the expected first_name/last_name pair. For example:
  * Adena Hobbs Bosley (line 29) - name became "Adena Hobbs"
  * Boris Harrington Harrington (line 217) - name became "Boris Harrington"
  * Copeland (line 302) - name became "Copeland '\t'"
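
A minimal before/after sketch of the clean-up (the raw fragment is hypothetical, with null bytes shown escaped; the cleaned form matches the test data):

```text
raw:     2\x00\tDakota\x00\tGarza\x00\t409025\x00\[email protected]
cleaned: 2\tDakota\tGarza\t409025\[email protected]
```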

A Redshift data creation script is also included.
The `upload_to_redshift.py` file first pushes the data to AWS/S3, then copies it into Redshift.
* Note: credentials must first be updated in the file before running.
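
The S3 upload itself is not part of the committed script; a minimal sketch of that step using `boto3` (bucket and key names are hypothetical) could look like this:

```python
import boto3

def upload_to_s3(local_path, bucket='<bucket>', key='data_parsable.tsv'):
    """Upload the parsable TSV to S3 so the Redshift copy command can read it."""
    s3 = boto3.client('s3')
    s3.upload_file(local_path, bucket, key)
    return 's3://{}/{}'.format(bucket, key)
```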
47 changes: 47 additions & 0 deletions test_convert_to_tsv.py
@@ -0,0 +1,47 @@
from unittest import TestCase
import create_parsable_tsv


class TestCreateParsableTsv(TestCase):
    def test_get_file_contents(self):
        filename = "./data/data.tsv"
        contents = create_parsable_tsv.get_file_contents(filename)

        expected_length = 1008
        actual_length = len(contents)

        self.assertEqual(expected_length, actual_length)

    def test_clean_line_line1(self):
        filename = "./data/data.tsv"
        raw_line = create_parsable_tsv.get_file_contents(filename)[1]

        expected_output = "1\tAddison\tMarks\t196296\[email protected]"
        actual_output = create_parsable_tsv.clean_line(raw_line)

        self.assertEqual(expected_output, actual_output)

    def test_clean_line_line613(self):
        filename = "./data/data.tsv"
        raw_line = create_parsable_tsv.get_file_contents(filename)[618]

        expected_output = "613\t'\\t'\t'\\t'\t104969\[email protected]"
        actual_output = create_parsable_tsv.clean_line(raw_line)

        self.assertEqual(expected_output, actual_output)

    def test_fix_line_item_issues(self):
        item = "29\tAdena\tHobbs\tBosley\t656184\[email protected]"
        expected_output = "29\tAdena\tHobbs\t656184\[email protected]"
        actual_output = create_parsable_tsv.fix_line_item_issues(item)

        self.assertEqual(expected_output, actual_output)

    def test_create_parsable_file_contents(self):
        filename = "./data/data.tsv"
        raw_lines = create_parsable_tsv.get_file_contents(filename)[:3]

        expected_output = [
            'id\tfirst_name\tlast_name\taccount_number\temail\n',
            '1\tAddison\tMarks\t196296\[email protected]\n',
            '2\tDakota\tGarza\t409025\[email protected]',
        ]
        actual_output = create_parsable_tsv.create_parsable_file_contents(raw_lines)

        self.assertEqual(expected_output, actual_output)
107 changes: 107 additions & 0 deletions upload_to_redshift.py
@@ -0,0 +1,107 @@
import psycopg2

"""
Hardcoded list of data types in data.tsv
"""
def get_data_types():
# id first_name last_name account_number email
return ['smallint', 'varchar', 'varchar', 'smallint', 'varchar']


"""
Grabs all data from tsv file
"""
def get_all_data(filepath):
with open(filepath, 'r') as datafile:
return datafile.readlines()


"""
Goes through data to find longest value for each data type

Used for table creation, specifically for varchar types
"""
def find_varchar_lengths(data):
lengths = [0,0,0,0,0]
for line in data:
vals = line.split('\t')
for i in range(len(vals)):
val_length = len(vals[i])
if val_length > lengths[i]:
lengths[i] = val_length
return lengths


"""
Builds create table statement to eventually read data into redshift
"""
def create_table(headers, types, lengths):
statement = "create table user_data ("
for i in range(len(types)):
if types[i] == 'varchar':
statement = (statement + '\n{} varchar({})').format(headers[i].lower(), lengths[i])
else:
statement = (statement + '\n{} {},'.format(headers[i].lower(), types[i]))
return statement[:-1] + ');'
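# Illustrative output for the data.tsv schema (the varchar lengths here are
# hypothetical; the real ones are computed from the file by find_varchar_lengths):
#   create table user_data (
#   id smallint,
#   first_name varchar(10),
#   last_name varchar(11),
#   account_number integer,
#   email varchar(34));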


"""
Put data on AWS/S3 to push to redshift
"""
def set_up_data_on_s3(statement, host, user, port, password):
# set up connection

connection = psycopg2.connect(
host=host,
user=user,
port=port,
password=password,
dbname='tsv_data_db')

cursor = connection.cursor()

cursor.execute(statement)
connection.commit()

return connection, cursor


"""
Push data from AWS/S3 to redshift
"""
def push_to_redshift(s3_connection, s3_cursor, filepath, access_key_id, secret_access_key):
sql = """copy tsv_data from '""" + filepath + """'
access_key_id '""" + access_key_id + """'
secret_access_key '""" + secret_access_key + """'
region 'us-west-1'
ignoreheader 1
null as 'NA'
removequotes
delimiter '\t';"""

s3_cursor.execute(sql)
s3_connection.commit()


def main():
    filepath = "./data/data_parsable.tsv"
    data = get_all_data(filepath)
    headers = data[0].replace('\n', '').split('\t')
    types = get_data_types()
    lengths = find_varchar_lengths(data)

    statement = create_table(headers, types, lengths)

    # note: update these values with relevant credentials
    host = 'mydb.mydatabase.us-west-2.redshift.amazonaws.com'
    user = 'user'
    port = 1234
    password = 'password'
    access_key_id = '<access_key_id>'
    secret_access_key = '<secret_access_key>'

    # the copy source must be the file's S3 location, not the local path
    s3_filepath = 's3://<bucket>/data_parsable.tsv'

    s3_connection, s3_cursor = set_up_data_on_s3(statement, host, user, port, password)
    push_to_redshift(s3_connection, s3_cursor, s3_filepath, access_key_id, secret_access_key)


if __name__ == '__main__':
    main()