#!/usr/bin/env python3
import argparse
import re

import pandas as pd
from google.cloud import bigquery


def upload_dataframe_to_bigquery(df, project_id, dataset_name, table_name):
    client = bigquery.Client(project=project_id)

    dataset_ref = bigquery.dataset.DatasetReference(project_id, dataset_name)
    table_ref = bigquery.table.TableReference(dataset_ref, table_name)

    # load the DataFrame into the target BigQuery table
    job = client.load_table_from_dataframe(df, table_ref)

    # block until the load job completes
    job.result()

    print(
        "Loaded {} rows into {}:{}.".format(job.output_rows, dataset_name, table_name)
    )

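# Note: load_table_from_dataframe is used above with its defaults; if the table's
# schema or write behaviour ever needs to be controlled explicitly, a
# bigquery.LoadJobConfig could be passed via the job_config parameter
# (a possible extension, not something this script currently does).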

def sanitize_column_name(column_name):
    # strip out anything that is not an ASCII letter so the result is a
    # BigQuery-safe column name
    regex = re.compile("[^a-zA-Z]")
    return regex.sub("", column_name)


def sanitize_column_names(df):
    # map each original column name to its sanitized form, then rename
    sanitized_columns = {}
    for column in df.columns:
        sanitized_column = sanitize_column_name(column)
        sanitized_columns[column] = sanitized_column

    return df.rename(columns=sanitized_columns)

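# For illustration only (hypothetical headers, not taken from the real ticket CSV):
# sanitize_column_names would rename a column such as "Created At" to "CreatedAt"
# and "Ticket #" to "Ticket".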

def main():
    """
    Command-line entrypoint.
    """
    parser = argparse.ArgumentParser(
        description="Sanitize ticket CSV and upload to BigQuery"
    )

    parser.add_argument(
        "csv_file", type=str, help="Ticket CSV file",
    )

    parser.add_argument("-p", "--project-id", help="BigQuery project ID")

    parser.add_argument(
        "-d", "--dataset-name", help="BigQuery dataset name to create or append to"
    )

    parser.add_argument(
        "-t", "--table-name", help="BigQuery table name to create or append to"
    )

    args = parser.parse_args()

    # read the CSV, sanitize its column names, and load it into BigQuery
    df = pd.read_csv(args.csv_file)
    sanitized_df = sanitize_column_names(df)
    upload_dataframe_to_bigquery(
        sanitized_df, args.project_id, args.dataset_name, args.table_name
    )

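# Example invocation (file, project, dataset, and table names below are
# hypothetical placeholders; substitute your own):
#   python upload_tickets.py tickets.csv \
#       --project-id my-gcp-project --dataset-name support --table-name tickets
# This assumes credentials for google-cloud-bigquery are available, e.g. via
# Application Default Credentials (`gcloud auth application-default login`).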

if __name__ == "__main__":
    main()