Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions clean_tsv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
def col_len(tlist):
    """Return the combined length of every item in *tlist*.

    *tlist* is an iterable of sized items; in this script each item is the
    list of fields obtained by splitting one input line on tabs, so the
    result is the number of columns collected so far for a record.
    An empty *tlist* yields 0.
    """
    return sum(len(fragment) for fragment in tlist)

# Change INPUT_PATH to the location of the input file.
INPUT_PATH = "C:/Users/vwing/Desktop/data/data.tsv"
OUTPUT_PATH = 'data_utf8.tsv'
# Number of tab-separated fields in a well-formed record.
EXPECTED_COLUMNS = 5


def _split_fields(raw_line):
    """Strip surrounding whitespace from *raw_line* and split it on tabs."""
    return raw_line.strip().split('\t')


def _flatten(fragments):
    """Concatenate the field lists in *fragments* into one flat list."""
    return [field for fragment in fragments for field in fragment]


def _report_exception(fragments):
    """Print a record that matches no repair rule so it can be reviewed."""
    print('*'*80)
    print('exception: {0}'.format(fragments))


def _repair_record(fragments):
    """Merge the line fragments of one record into its output fields.

    *fragments* is a list of field lists, one per input line that was
    consumed for the record; it may be modified in place.

    Returns the flat list of fields when the record has the expected number
    of columns or can be repaired, otherwise prints the fragments and
    returns an empty list.
    """
    total = col_len(fragments)
    if total == EXPECTED_COLUMNS:
        return _flatten(fragments)

    # One surplus column is attributed to an error in the name columns.
    if total == EXPECTED_COLUMNS + 1:
        first = fragments[0]
        if len(first) == 3:
            # The line broke after the third field: fold the first field of
            # the next fragment into it as an additional last name.
            first[2] = first[2] + ' ' + fragments[1][0]
            fragments[1].pop(0)
            return _flatten(fragments)
        if len(first) == 2 and len(fragments[1]) == 4:
            second = fragments[1]
            repeats_first_name = first[-1].strip() == second[0].strip()
            repeats_last_name = second[0].strip() == second[1].strip()
            # Either kind of duplicate is resolved by dropping the leading
            # field of the second fragment; the row is emitted in both cases.
            if repeats_first_name or repeats_last_name:
                second.pop(0)
                return _flatten(fragments)

    _report_exception(fragments)
    return []


with open(INPUT_PATH, "r", encoding='UTF-16LE') as in_file:
    # Read every line up front; records may span several physical lines.
    data = in_file.readlines()

# The output is opened once for the whole run. Append mode is kept, so any
# existing content of OUTPUT_PATH is preserved and new rows are added after it.
with open(OUTPUT_PATH, 'a', encoding='UTF-8') as out_file:
    i = 0
    while i < len(data):
        new_list = [_split_fields(data[i])]
        length = col_len(new_list)
        if length < EXPECTED_COLUMNS:
            # Pull in following lines until the record is complete, stopping
            # at end of input instead of indexing past the last line.
            while length < EXPECTED_COLUMNS and i + 1 < len(data):
                i = i + 1
                new_list.append(_split_fields(data[i]))
                length = col_len(new_list)
            # A single-field line right after a rebuilt record is taken to be
            # an overflow of that record rather than the start of a new one.
            # NOTE(review): this check is applied only to records rebuilt
            # from several lines -- confirm that matches the intended nesting.
            if length == EXPECTED_COLUMNS and i + 1 < len(data):
                following = _split_fields(data[i + 1])
                if len(following) == 1:
                    i = i + 1
                    new_list.append(following)

        data_list = _repair_record(new_list)
        # Unrepairable records yield an empty field list, so a blank line is
        # written in their place.
        # NOTE(review): confirm blank placeholder lines are wanted downstream.
        line = '\t'.join(data_list) + '\n'
        # Hyphens and slashes are removed from every field of the row.
        out_file.write(line.replace("-", '').replace('/', ''))
        i = i + 1
6 changes: 6 additions & 0 deletions clean_tsv_readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Overall, the biggest challenge was determining where the erroneous tabs were and correcting those first by checking the column counts.
From there I tackled the next main issue, which was the inconsistency in names. After reviewing the data file, I assumed that any
deviation in column count was due to errors in name input. If consecutive names in the same row were identical,
I assumed they were duplicates; otherwise I treated them as longer names and made the "third" name part of the last name.
For any future rows that do not fit this handling logic, I implemented an exception handler
that prints the offending row to the console for later evaluation.
Loading