120 changes: 120 additions & 0 deletions create_parsable_tsv.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3


"""
usage: python3 create_parsable_tsv.py path/to/file
"""

import sys
import re


"""
Takes filename and returns entire contents

Note: filename must end with '.tsv' and first line must match (after cleaning)
id\tfirst_name\tlast_name\taccount_number\temail
"""
def get_file_contents(filename):
with open(filename, 'r', encoding='utf-16le') as tsv_file:
return tsv_file.readlines()


"""
Takes original data from file and turns it into a string that looks like:
1\tFistName\tLastName\t123456\[email protected]
"""
def clean_line(line_of_data):
# line should always look like: 1\tFistName\tLastName\t123456\[email protected]
fixed_data = line_of_data.replace('\x00','')
while '\t\t' in fixed_data:
fixed_data = fixed_data.replace("\t\t", "\t\'\\t\'\t")

return '\t'.join(fixed_data.split())
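# Illustrative example (mirrors the unit tests; raw_line_613 stands for the raw
# line read from the file): a record with empty name fields keeps the quoted
# placeholders after cleaning:
#   clean_line(raw_line_613)  ->  "613\t'\\t'\t'\\t'\t104969\[email protected]"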


"""
Regex list works for this file only - would change for other files

Specified for: id first_name last_name account_number email
"""
def get_hardcoded_table_data_regex():
return ['^[0-9]+', '[a-zA-Z]+', '[a-zA-Z]+', '^[0-9]', '\S+@\S+']


"""
Checks line for too many/not enough values and deletes/adds fields
"""
def fix_line_item_issues(item):
regexes = get_hardcoded_table_data_regex()

values = item.split()
for i in range(len(regexes)):
regex = regexes[i]
val = values[i]
if not re.search(regex, val):
if len(regexes) > len(values):
values.insert(i, '\t')
else:
values.remove(val)
else:
if i == 1 or i == 2:
if '\\t' not in val:
values[i] = val.title()
elif i == 4:
values[i] = val.lower()

return '\t'.join(values)
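# Illustrative example (taken from the unit test): the extra name part "Bosley"
# fails the account_number regex and is dropped:
#   fix_line_item_issues("29\tAdena\tHobbs\tBosley\t656184\[email protected]")
#   ->  "29\tAdena\tHobbs\t656184\[email protected]"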


"""
Loops through entire file contents and creates new, cleaned contents
"""
def create_parsable_file_contents(file_contents):
# special case for file header
header = clean_line(file_contents[0])
new_contents = [header +'\n']

last_str = ""
last_num = 1

for i in range(1, len(file_contents)):
# remove null characters in line
cleaned_line = clean_line(file_contents[i])

# start of new data entry found
if cleaned_line.startswith(str(last_num+1)):
new_contents.append(fix_line_item_issues(last_str) + '\n')

# special case - last line in file
if i == len(file_contents)-1:
new_contents.append(fix_line_item_issues(cleaned_line))
return new_contents

last_num += 1
last_str = cleaned_line
else:
# special case - don't include \t for header (line 1)
last_str += '\t' + cleaned_line if i != 1 else cleaned_line

return new_contents
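# Illustrative sketch (hypothetical raw record split across two file lines):
#   "5\tDana\tCole"  followed by  "330990\[email protected]"
# is accumulated into last_str as "5\tDana\tCole\t330990\[email protected]"
# before being passed to fix_line_item_issues().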


"""
Writes new, cleaned contents to file with name "path/to/file_parsable.tsv"
"""
def create_parsable_file(file_contents, filename):
parsable_filename = filename.replace('.tsv', '_parsable.tsv')
with open(parsable_filename, 'w', encoding='utf-8') as new_tsv_file:
new_tsv_file.writelines(file_contents)


def main():
    filename = sys.argv[1]  # 1st argument should be path/to/filename.tsv
    file_contents = get_file_contents(filename)
    parsable_file_contents = create_parsable_file_contents(file_contents)
    create_parsable_file(parsable_file_contents, filename)


if __name__ == '__main__':
    main()
19 changes: 19 additions & 0 deletions create_parsable_tsv_notes.md
@@ -0,0 +1,19 @@
### Script Usage Notes

Usage: `python3 create_parsable_tsv.py path/to/file`

Creates a new file in the same folder as the original data, named `filename_parsable.tsv`
(note: the filename must end with `.tsv`)

Anomalies found in the original data.tsv:
* Null characters had to be removed
* Unnecessary newline characters within lines of data had to be dealt with
  * In all cases, data was concatenated across lines (see the sketch below)
* Some names included more or fewer parts than the expected first_name/last_name pair. For example:
  * Adena Hobbs Bosley (line 29) - name became "Adena Hobbs"
  * Boris Harrington Harrington (line 217) - name became "Boris Harrington"
  * Copeland (line 302) - name became "Copeland '\t'"
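
A minimal before/after sketch of the clean-up (the raw fragment is hypothetical, with null bytes shown escaped; the cleaned form matches the test data):

```text
raw:     2\x00\tDakota\x00\tGarza\x00\t409025\x00\[email protected]
cleaned: 2\tDakota\tGarza\t409025\[email protected]
```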

A Redshift data creation script is also included.
The `upload_to_redshift.py` file first pushes the data to AWS/S3, then copies it into Redshift.
* Note: credentials must first be updated in the file before running.
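
The S3 upload itself is not part of the committed script; a minimal sketch of that step using `boto3` (bucket and key names are hypothetical) could look like this:

```python
import boto3

def upload_to_s3(local_path, bucket='<bucket>', key='data_parsable.tsv'):
    """Upload the parsable TSV to S3 so the Redshift copy command can read it."""
    s3 = boto3.client('s3')
    s3.upload_file(local_path, bucket, key)
    return 's3://{}/{}'.format(bucket, key)
```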
47 changes: 47 additions & 0 deletions test_convert_to_tsv.py
@@ -0,0 +1,47 @@
from unittest import TestCase
import create_parsable_tsv


class TestCreateParsableTsv(TestCase):
    def test_get_file_contents(self):
        filename = "./data/data.tsv"
        contents = create_parsable_tsv.get_file_contents(filename)

        expected_length = 1008
        actual_length = len(contents)

        self.assertEqual(expected_length, actual_length)

    def test_clean_line_line1(self):
        filename = "./data/data.tsv"
        raw_line = create_parsable_tsv.get_file_contents(filename)[1]

        expected_output = "1\tAddison\tMarks\t196296\[email protected]"
        actual_output = create_parsable_tsv.clean_line(raw_line)

        self.assertEqual(expected_output, actual_output)

    def test_clean_line_line613(self):
        filename = "./data/data.tsv"
        raw_line = create_parsable_tsv.get_file_contents(filename)[618]

        expected_output = "613\t'\\t'\t'\\t'\t104969\[email protected]"
        actual_output = create_parsable_tsv.clean_line(raw_line)

        self.assertEqual(expected_output, actual_output)

    def test_fix_line_item_issues(self):
        item = "29\tAdena\tHobbs\tBosley\t656184\[email protected]"
        expected_output = "29\tAdena\tHobbs\t656184\[email protected]"
        actual_output = create_parsable_tsv.fix_line_item_issues(item)

        self.assertEqual(expected_output, actual_output)

    def test_create_parsable_file_contents(self):
        filename = "./data/data.tsv"
        raw_lines = create_parsable_tsv.get_file_contents(filename)[:3]

        expected_output = [
            'id\tfirst_name\tlast_name\taccount_number\temail\n',
            '1\tAddison\tMarks\t196296\[email protected]\n',
            '2\tDakota\tGarza\t409025\[email protected]',
        ]
        actual_output = create_parsable_tsv.create_parsable_file_contents(raw_lines)

        self.assertEqual(expected_output, actual_output)
107 changes: 107 additions & 0 deletions upload_to_redshift.py
@@ -0,0 +1,107 @@
import psycopg2

"""
Hardcoded list of data types in data.tsv
"""
def get_data_types():
# id first_name last_name account_number email
return ['smallint', 'varchar', 'varchar', 'smallint', 'varchar']


"""
Grabs all data from tsv file
"""
def get_all_data(filepath):
with open(filepath, 'r') as datafile:
return datafile.readlines()


"""
Goes through data to find longest value for each data type

Used for table creation, specifically for varchar types
"""
def find_varchar_lengths(data):
lengths = [0,0,0,0,0]
for line in data:
vals = line.split('\t')
for i in range(len(vals)):
val_length = len(vals[i])
if val_length > lengths[i]:
lengths[i] = val_length
return lengths


"""
Builds create table statement to eventually read data into redshift
"""
def create_table(headers, types, lengths):
statement = "create table user_data ("
for i in range(len(types)):
if types[i] == 'varchar':
statement = (statement + '\n{} varchar({})').format(headers[i].lower(), lengths[i])
else:
statement = (statement + '\n{} {},'.format(headers[i].lower(), types[i]))
return statement[:-1] + ');'
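# Illustrative output for the data.tsv schema (the varchar lengths here are
# hypothetical; the real ones are computed from the file by find_varchar_lengths):
#   create table user_data (
#   id smallint,
#   first_name varchar(10),
#   last_name varchar(11),
#   account_number integer,
#   email varchar(34));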


"""
Put data on AWS/S3 to push to redshift
"""
def set_up_data_on_s3(statement, host, user, port, password):
# set up connection

connection = psycopg2.connect(
host=host,
user=user,
port=port,
password=password,
dbname='tsv_data_db')

cursor = connection.cursor()

cursor.execute(statement)
connection.commit()

return connection, cursor


"""
Push data from AWS/S3 to redshift
"""
def push_to_redshift(s3_connection, s3_cursor, filepath, access_key_id, secret_access_key):
sql = """copy tsv_data from '""" + filepath + """'
access_key_id '""" + access_key_id + """'
secret_access_key '""" + secret_access_key + """'
region 'us-west-1'
ignoreheader 1
null as 'NA'
removequotes
delimiter '\t';"""

s3_cursor.execute(sql)
s3_connection.commit()


def main():
    filepath = "./data/data_parsable.tsv"
    data = get_all_data(filepath)
    headers = data[0].replace('\n', '').split('\t')
    types = get_data_types()
    lengths = find_varchar_lengths(data)

    statement = create_table(headers, types, lengths)

    # note: update these values with relevant credentials
    host = 'mydb.mydatabase.us-west-2.redshift.amazonaws.com'
    user = 'user'
    port = 1234
    password = 'password'
    access_key_id = '<access_key_id>'
    secret_access_key = '<secret_access_key>'

    # the copy source must be the file's S3 location, not the local path
    s3_filepath = 's3://<bucket>/data_parsable.tsv'

    s3_connection, s3_cursor = set_up_data_on_s3(statement, host, user, port, password)
    push_to_redshift(s3_connection, s3_cursor, s3_filepath, access_key_id, secret_access_key)


if __name__ == '__main__':
    main()