diff --git a/create_parsable_tsv.py b/create_parsable_tsv.py
new file mode 100644
index 0000000..1d79c3d
--- /dev/null
+++ b/create_parsable_tsv.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+"""
+usage: python3 create_parsable_tsv.py path/to/file
+"""
+
+import re
+import sys
+
+
+def get_file_contents(filename):
+    """Takes a filename and returns the entire contents as a list of lines.
+
+    Note: filename must end with '.tsv' and the first line must match (after cleaning)
+        id\tfirst_name\tlast_name\taccount_number\temail
+    """
+    with open(filename, 'r', encoding='utf-16le') as tsv_file:
+        return tsv_file.readlines()
+
+
+def clean_line(line_of_data):
+    """Takes a raw line from the file and turns it into a string that looks like:
+        1\tFirstName\tLastName\t123456\temail@abc.com
+    """
+    # strip the null characters left over from the utf-16 encoding
+    fixed_data = line_of_data.replace('\x00', '')
+    # mark empty fields (double tabs) with a literal '\t' placeholder
+    while '\t\t' in fixed_data:
+        fixed_data = fixed_data.replace('\t\t', "\t'\\t'\t")
+
+    return '\t'.join(fixed_data.split())
+
+
+def get_hardcoded_table_data_regex():
+    """Regex list works for this file only - would change for other files.
+
+    Specified for: id first_name last_name account_number email
+    """
+    return [r'^[0-9]+', r'[a-zA-Z]+', r'[a-zA-Z]+', r'^[0-9]', r'\S+@\S+']
+
+
+def fix_line_item_issues(item):
+    """Checks a line for too many/not enough values and deletes/adds fields."""
+    regexes = get_hardcoded_table_data_regex()
+
+    values = item.split()
+    for i in range(len(regexes)):
+        regex = regexes[i]
+        val = values[i]
+        if not re.search(regex, val):
+            if len(regexes) > len(values):
+                # too few values: pad with a placeholder field
+                values.insert(i, '\t')
+            else:
+                # too many values: drop the one that doesn't fit this column
+                values.remove(val)
+        else:
+            if i == 1 or i == 2:
+                # name columns: normalize capitalization unless it's a placeholder
+                if '\\t' not in val:
+                    values[i] = val.title()
+            elif i == 4:
+                # email column: lowercase
+                values[i] = val.lower()
+
+    return '\t'.join(values)
+
+
+def create_parsable_file_contents(file_contents):
+    """Loops through the entire file contents and builds the new, cleaned contents."""
+    # special case for the file header
+    header = clean_line(file_contents[0])
+    new_contents = [header + '\n']
+
+    last_str = ""
+    last_num = 1
+
+    for i in range(1, len(file_contents)):
+        # remove null characters and collapse whitespace in the line
+        cleaned_line = clean_line(file_contents[i])
+
+        if cleaned_line.startswith(str(last_num + 1)):
+            # start of a new data entry found: flush the previous entry
+            new_contents.append(fix_line_item_issues(last_str) + '\n')
+            last_num += 1
+            last_str = cleaned_line
+        else:
+            # continuation of the current entry; re-join the pieces with a tab
+            # (the first data row simply starts the entry, so no leading tab)
+            if last_str:
+                last_str += '\t' + cleaned_line
+            else:
+                last_str = cleaned_line
+
+    # flush the final entry (no trailing newline, matching the original file)
+    if last_str:
+        new_contents.append(fix_line_item_issues(last_str))
+
+    return new_contents
+
+
+def create_parsable_file(file_contents, filename):
+    """Writes the new, cleaned contents to a file named "path/to/file_parsable.tsv"."""
+    parsable_filename = filename.replace('.tsv', '_parsable.tsv')
+    with open(parsable_filename, 'w', encoding='utf-8') as new_tsv_file:
+        new_tsv_file.writelines(file_contents)
+
+
+def main():
+    filename = sys.argv[1]  # 1st argument should be path/to/filename.tsv
+    file_contents = get_file_contents(filename)
+    parsable_file_contents = create_parsable_file_contents(file_contents)
+    create_parsable_file(parsable_file_contents, filename)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/create_parsable_tsv_notes.md b/create_parsable_tsv_notes.md
new file mode 100644
index 0000000..7546aaa
--- /dev/null
+++ b/create_parsable_tsv_notes.md
@@ -0,0 +1,19 @@
+### Script Usage Notes
+
+Usage:
+`python3 create_parsable_tsv.py path/to/file`
+
+Creates a new file in the same folder as the original data called `filename_parsable.tsv`
+(note: the filename must end with ".tsv").
+
+Anomalies found in the original data.tsv:
+* Null characters had to be removed
+* Stray newline characters inside individual records had to be handled
+    * In all cases, the split pieces were concatenated back into a single line
+* Some names included more or fewer values than first_name/last_name (see the example after this list). For example:
+    * Adena Hobbs Bosley (line 29) - name became "Adena Hobbs"
+    * Boris Harrington Harrington (line 217) - name became "Boris Harrington"
+    * Copeland (line 302) - name became "Copeland '\t'"
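+
+For reference, this is how the script handles the extra middle name on line 29 (the same
+row exercised in `test_convert_to_tsv.py`); a rough interactive sketch, assuming it is run
+from the repository root so the module imports:
+
+```python
+import create_parsable_tsv
+
+row = "29\tAdena\tHobbs\tBosley\t656184\tac.ipsum.Phasellus@ut.net"
+create_parsable_tsv.fix_line_item_issues(row)
+# -> '29\tAdena\tHobbs\t656184\tac.ipsum.phasellus@ut.net'
+```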
+
+A Redshift table creation script is also included.
+The `upload_to_redshift.py` script creates the target table in Redshift and then runs a `COPY` to load the cleaned data; the file needs to be staged on S3 first (a sketch follows below).
+* Note: credentials must first be updated in the file before running.
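+
+The staging upload itself can be done with `boto3`; a minimal sketch, assuming `boto3` is
+installed (the bucket and key names below are placeholders):
+
+```python
+import boto3
+
+# upload the cleaned file so that the Redshift COPY can read it from S3
+s3 = boto3.client('s3')
+s3.upload_file('./data/data_parsable.tsv', 'my-tsv-bucket', 'data/data_parsable.tsv')
+```
+
+With the file staged, the `COPY` source in `upload_to_redshift.py` would point at
+`s3://my-tsv-bucket/data/data_parsable.tsv` rather than the local path.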
\ No newline at end of file
diff --git a/test_convert_to_tsv.py b/test_convert_to_tsv.py
new file mode 100644
index 0000000..1b9d0db
--- /dev/null
+++ b/test_convert_to_tsv.py
@@ -0,0 +1,47 @@
+from unittest import TestCase
+
+import create_parsable_tsv
+
+
+class TestCreateParsableTsv(TestCase):
+    def test_get_file_contents(self):
+        filename = "./data/data.tsv"
+        contents = create_parsable_tsv.get_file_contents(filename)
+
+        expected_length = 1008
+        actual_length = len(contents)
+
+        self.assertEqual(expected_length, actual_length)
+
+    def test_clean_line_line1(self):
+        filename = "./data/data.tsv"
+        raw_line = create_parsable_tsv.get_file_contents(filename)[1]
+
+        expected = "1\tAddison\tMarks\t196296\tornare.lectus@et.edu"
+        actual = create_parsable_tsv.clean_line(raw_line)
+
+        self.assertEqual(expected, actual)
+
+    def test_clean_line_line613(self):
+        filename = "./data/data.tsv"
+        raw_line = create_parsable_tsv.get_file_contents(filename)[618]
+
+        expected = "613\t'\\t'\t'\\t'\t104969\tdictum@Suspendisse.net"
+        actual = create_parsable_tsv.clean_line(raw_line)
+
+        self.assertEqual(expected, actual)
+
+    def test_fix_line_item_issues(self):
+        item = "29\tAdena\tHobbs\tBosley\t656184\tac.ipsum.Phasellus@ut.net"
+        expected = "29\tAdena\tHobbs\t656184\tac.ipsum.phasellus@ut.net"
+        actual = create_parsable_tsv.fix_line_item_issues(item)
+
+        self.assertEqual(expected, actual)
+
+    def test_create_parsable_file_contents(self):
+        filename = "./data/data.tsv"
+        contents = create_parsable_tsv.get_file_contents(filename)[:3]
+
+        expected = [
+            'id\tfirst_name\tlast_name\taccount_number\temail\n',
+            '1\tAddison\tMarks\t196296\tornare.lectus@et.edu\n',
+            '2\tDakota\tGarza\t409025\tscelerisque@praesentluctus.edu',
+        ]
+        actual = create_parsable_tsv.create_parsable_file_contents(contents)
+
+        self.assertEqual(expected, actual)
diff --git a/upload_to_redshift.py b/upload_to_redshift.py
new file mode 100644
index 0000000..7a142dc
--- /dev/null
+++ b/upload_to_redshift.py
@@ -0,0 +1,107 @@
+import psycopg2
+
+
+def get_data_types():
+    """Hardcoded list of the column data types in data.tsv."""
+    # id first_name last_name account_number email
+    return ['smallint', 'varchar', 'varchar', 'smallint', 'varchar']
+
+
+def get_all_data(filepath):
+    """Grabs all data from the tsv file."""
+    with open(filepath, 'r') as datafile:
+        return datafile.readlines()
+
+
+def find_varchar_lengths(data):
+    """Goes through the data to find the longest value in each column.
+
+    Used for table creation, specifically to size the varchar types.
+    """
+    lengths = [0, 0, 0, 0, 0]
+    for line in data:
+        vals = line.rstrip('\n').split('\t')
+        for i in range(len(vals)):
+            if len(vals[i]) > lengths[i]:
+                lengths[i] = len(vals[i])
+    return lengths
+
+
+def create_table(headers, types, lengths):
+    """Builds the create table statement used to load the data into Redshift."""
+    statement = "create table user_data ("
+    for i in range(len(types)):
+        if types[i] == 'varchar':
+            statement += '\n{} varchar({}),'.format(headers[i].lower(), lengths[i])
+        else:
+            statement += '\n{} {},'.format(headers[i].lower(), types[i])
+    # replace the trailing comma with the closing parenthesis
+    return statement[:-1] + ');'
+
+
+def set_up_data_on_s3(statement, host, user, port, password):
+    """Connects to Redshift and creates the target table."""
+    connection = psycopg2.connect(
+        host=host,
+        user=user,
+        port=port,
+        password=password,
+        dbname='tsv_data_db')
+
+    cursor = connection.cursor()
+
+    cursor.execute(statement)
+    connection.commit()
+
+    return connection, cursor
+
+
+def push_to_redshift(s3_connection, s3_cursor, filepath, access_key_id, secret_access_key):
+    """Runs the COPY that loads the data file into Redshift."""
+    # note: COPY loads from S3, so filepath should be the staged s3:// location
+    sql = """copy user_data from '""" + filepath + """'
+    access_key_id '""" + access_key_id + """'
+    secret_access_key '""" + secret_access_key + """'
+    region 'us-west-1'
+    ignoreheader 1
+    null as 'NA'
+    removequotes
+    delimiter '\t';"""
+
+    s3_cursor.execute(sql)
+    s3_connection.commit()
+
+
+def main():
+    filepath = "./data/data_parsable.tsv"
+    data = get_all_data(filepath)
+    headers = data[0].replace('\n', '').split('\t')
+    types = get_data_types()
+    lengths = find_varchar_lengths(data)
+
+    statement = create_table(headers, types, lengths)
+
+    # note: update these values with relevant credentials
+    host = 'mydb.mydatabase.us-west-2.redshift.amazonaws.com'
+    user = 'user'
+    port = 1234
+    password = 'password'
+    access_key_id = ''
+    secret_access_key = ''
+
+    s3_connection, s3_cursor = set_up_data_on_s3(statement, host, user, port, password)
+    push_to_redshift(s3_connection, s3_cursor, filepath, access_key_id, secret_access_key)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file