"""Safe cleaning pipeline for tabular customer data.

Pipeline steps, in the order applied by ``clean_data``:

1. Standardize column names
2. Remove empty rows and columns
3. Remove duplicates
4. Strip and lowercase text columns
5. Clean specific columns (gender, education, policy type, state names, etc.)
6. Clean and convert numeric columns
7. Handle missing values
"""

import warnings

import numpy as np
import pandas as pd


def _is_text_column(series: pd.Series) -> bool:
    """Return True when *series* has an object or string dtype."""
    return bool(
        pd.api.types.is_object_dtype(series)
        or pd.api.types.is_string_dtype(series)
    )


def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize column names: lowercase with underscores.

    The column index of *df* is reassigned in place and *df* is returned.
    Labels are cast to ``str`` first so non-string labels do not break the
    ``.str`` accessor.
    """
    df.columns = (
        df.columns.astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r'[^a-z0-9]+', '_', regex=True)
        .str.replace(r'_+', '_', regex=True)
        .str.strip('_')
    )
    return df


def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename the ``st`` column to ``state`` for consistency.

    The rename is skipped when a ``state`` column already exists, because
    renaming would then produce two columns with the same label.
    """
    if 'st' in df.columns and 'state' not in df.columns:
        df = df.rename(columns={'st': 'state'})
    return df


def remove_empty_rows_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without fully empty rows and fully empty columns."""
    without_empty_rows = df.dropna(axis=0, how='all')
    return without_empty_rows.dropna(axis=1, how='all')


def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* without duplicate rows (the index is not reset)."""
    return df.drop_duplicates()


def strip_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Strip whitespace and lowercase every text column.

    Missing values are left missing. Casting them with ``astype(str)`` would
    turn them into the literal text ``'nan'``, which ``handle_missing_values``
    could no longer recognize as missing.
    """
    for col in df.columns:
        values = df[col]
        if not _is_text_column(values):
            continue
        cleaned = values.astype(str).str.strip().str.lower()
        # Keep the original entry where it is missing, otherwise use the
        # cleaned text.
        df[col] = values.where(values.isna(), cleaned)
    return df


def clean_gender(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize gender values to ``'m'`` or ``'f'``.

    Expects already lowercased text; values outside the mapping are kept.
    """
    if 'gender' in df.columns:
        df['gender'] = df['gender'].replace({
            'male': 'm',
            'female': 'f',
            'femal': 'f',
        })
    return df


def clean_education(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize education values; values outside the mapping are kept."""
    if 'education' in df.columns:
        df['education'] = df['education'].replace({
            'bachelors': 'bachelor',
            'high school or below': 'high school',
            'doctor': 'doctorate',
        })
    return df


def clean_state_names(df: pd.DataFrame) -> pd.DataFrame:
    """Expand abbreviated state names to their full lowercase names."""
    if 'state' in df.columns:
        df['state'] = df['state'].replace({
            'az': 'arizona',
            'wa': 'washington',
            'cali': 'california',
        })
    return df


def cap_monthly_premium_auto(df: pd.DataFrame, cap: float = 1000) -> pd.DataFrame:
    """Cap ``monthly_premium_auto`` at *cap*; missing values stay missing."""
    if 'monthly_premium_auto' in df.columns:
        premium = df['monthly_premium_auto']
        df['monthly_premium_auto'] = np.where(premium > cap, cap, premium)
    return df


def clean_policy_type(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize ``policy_type`` values by dropping the ' auto' suffix."""
    if 'policy_type' in df.columns:
        df['policy_type'] = df['policy_type'].replace({
            'personal auto': 'personal',
            'corporate auto': 'corporate',
            'special auto': 'special',
        })
    return df


def clean_customer_lifetime_value(df: pd.DataFrame) -> pd.DataFrame:
    """Remove ``%`` from ``customer_lifetime_value`` and convert it to numeric.

    Values that cannot be parsed as numbers become NaN.
    """
    if 'customer_lifetime_value' in df.columns:
        as_text = (
            df['customer_lifetime_value']
            .astype(str)
            .str.replace('%', '', regex=False)
            .str.strip()
        )
        df['customer_lifetime_value'] = pd.to_numeric(as_text, errors='coerce')
    return df


def convert_data_types(df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns to numeric or datetime where possible.

    Each text column is converted as a whole: numeric first, then datetime.
    If any value in the column fails to parse, the column is left unchanged.
    Columns that are not text are skipped, so existing numeric and datetime
    columns keep their dtype.
    """
    for col in df.columns:
        values = df[col]
        if not _is_text_column(values):
            continue

        # Explicit try/except replaces the deprecated errors='ignore'.
        try:
            df[col] = pd.to_numeric(values)
            continue
        except (ValueError, TypeError):
            pass

        try:
            with warnings.catch_warnings():
                # Silence the "Could not infer format" notice emitted while
                # probing columns that usually turn out not to be dates.
                warnings.simplefilter('ignore', UserWarning)
                df[col] = pd.to_datetime(values)
        except (ValueError, TypeError, OverflowError):
            # Not a date column either; keep it as text.
            pass
    return df


def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values: numeric columns with 0, all others with 'unknown'.

    Results are assigned back to the column instead of using chained
    ``fillna(..., inplace=True)``, which acts on a temporary object.
    """
    for col in df.columns:
        fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else 'unknown'
        df[col] = df[col].fillna(fill_value)
    return df


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full cleaning pipeline and return the cleaned DataFrame.

    The pipeline works on a copy, so the caller's DataFrame is not modified.
    """
    df = df.copy()
    df = clean_column_names(df)
    df = rename_columns(df)
    df = remove_empty_rows_columns(df)
    df = remove_duplicates(df)
    df = strip_text_columns(df)  # strip + lowercase text before mapping
    df = clean_gender(df)
    df = clean_education(df)
    df = clean_state_names(df)
    df = cap_monthly_premium_auto(df)
    df = clean_policy_type(df)
    df = clean_customer_lifetime_value(df)
    df = convert_data_types(df)
    df = handle_missing_values(df)
    return df
version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountst
0rb50392washingtonnanmaster0.000.01000.01/0/00personalfour-door car2.704934washington
1qz44356arizonafbachelor697953.590.094.01/0/00personalfour-door car1131.464935arizona
2ai49188nevadafbachelor1288743.1748767.0108.01/0/00personaltwo-door car566.472247nevada
3ww63253californiambachelor764586.180.0106.01/0/00corporatesuv529.881344california
4ga49547washingtonmhigh school536307.6536357.068.01/0/00personalfour-door car17.269323washington
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value income \\\n", + "0 rb50392 washington nan master 0.00 0.0 \n", + "1 qz44356 arizona f bachelor 697953.59 0.0 \n", + "2 ai49188 nevada f bachelor 1288743.17 48767.0 \n", + "3 ww63253 california m bachelor 764586.18 0.0 \n", + "4 ga49547 washington m high school 536307.65 36357.0 \n", + "\n", + " monthly_premium_auto number_of_open_complaints policy_type vehicle_class \\\n", + "0 1000.0 1/0/00 personal four-door car \n", + "1 94.0 1/0/00 personal four-door car \n", + "2 108.0 1/0/00 personal two-door car \n", + "3 106.0 1/0/00 corporate suv \n", + "4 68.0 1/0/00 personal four-door car \n", + "\n", + " total_claim_amount st \n", + "0 2.704934 washington \n", + "1 1131.464935 arizona \n", + "2 566.472247 nevada \n", + "3 529.881344 california \n", + "4 17.269323 washington " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "\n", + "\n", + "url1 = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "url2 = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv'\n", + "url3 = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv'\n", + "\n", + "\n", + "customers = pd.read_csv(url1)\n", + "orders = pd.read_csv(url2)\n", + "products = pd.read_csv(url3)\n", + "\n", + "\n", + "customers_clean = cl.clean_data(customers)\n", + "orders_clean = cl.clean_data(orders)\n", + "products_clean = cl.clean_data(products)\n", + "\n", + "\n", + "customers_clean['st'] = customers_clean['state'].replace({'az':'arizona', 'wa':'washington', 'cali':'california'})\n", + "\n", + " \n", + "\n", + "customers_clean['education'] = customers_clean['education'].replace({'bachelors':'bachelor', 'high school or below':'high school', 'doctor':'doctorate'})\n", + "customers_clean['education'].unique()\n", + "\n", + "customers_clean['monthly_premium_auto'] = 
np.where(customers_clean['monthly_premium_auto'] > 1000, 1000, customers_clean['monthly_premium_auto'])\n", + "\n", + "customers_clean.head()" ] }, { @@ -72,14 +437,381 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original 'state' column values:\n", + "0 Arizona\n", + "1 California\n", + "2 Washington\n", + "3 Oregon\n", + "4 Oregon\n", + "Name: state, dtype: object\n", + "\n", + "Cleaned data preview:\n", + " unnamed_0 customer state customer_lifetime_value response coverage \\\n", + "0 0 dk49336 arizona 4809.216960 no basic \n", + "1 1 kx64629 california 2228.525238 no basic \n", + "2 2 lz68649 washington 14947.917300 no basic \n", + "3 3 xl78013 oregon 22332.439460 yes extended \n", + "4 4 qa50777 oregon 9025.067525 no premium \n", + "\n", + " education effective_to_date employmentstatus gender ... \\\n", + "0 college 2011-02-18 employed m ... \n", + "1 college 2011-01-18 unemployed f ... \n", + "2 bachelor 2011-02-10 employed m ... \n", + "3 college 2011-01-11 employed m ... \n", + "4 bachelor 2011-01-17 medical leave f ... 
\n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 corporate corporate l3 offer3 \n", + "1 1 personal personal l3 offer4 \n", + "2 2 personal personal l3 offer3 \n", + "3 2 corporate corporate l3 offer2 \n", + "4 7 personal personal l2 offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 agent 292.800000 four-door car medsize \n", + "1 call center 744.924331 four-door car medsize \n", + "2 call center 480.000000 suv medsize \n", + "3 branch 484.013411 four-door car medsize \n", + "4 branch 707.925645 four-door car medsize \n", + "\n", + " vehicle_type month \n", + "0 a 2 \n", + "1 a 1 \n", + "2 a 2 \n", + "3 a 1 \n", + "4 a 1 \n", + "\n", + "[5 rows x 27 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n" + ] + } + ], "source": [ - "# Your code goes here" + "import pandas as pd\n", + "import cleaning as cl \n", + "\n", + "# Load the dataset\n", + "marketing_customer = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv')\n", + "\n", + "# Check first few 'state' values before cleaning\n", + "print(\"Original 'state' column values:\")\n", + "print(marketing_customer['state'].head())\n", + "\n", + "# Clean the dataset using your cleaning pipeline\n", + "clean_marketing_customer = cl.clean_data(marketing_customer)\n", + "\n", + "# Show cleaned data head\n", + "print(\"\\nCleaned data preview:\")\n", + "print(clean_marketing_customer.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fa135db5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total Revenue by Sales Channel:\n", + " total_claim_amount\n", + "sales_channel \n", + "agent 1.810227e+06\n", + "branch 1.301204e+06\n", + "call center 9.266008e+05\n", + "web 7.066000e+05\n", + "\n", + "Average Customer Lifetime Value per Gender and Education:\n", + "education bachelor college doctorate high school master\n", + "gender \n", + "f 7874.269478 7748.823325 7328.508916 8675.220201 8157.053154\n", + "m 7703.601675 8052.459288 7415.333638 8149.687783 8168.832659\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is 
deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. 
Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. 
This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import cleaning as cl\n", + "\n", + "# Load and clean dataset\n", + "marketing_customer = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv')\n", + "clean_marketing_customer = cl.clean_data(marketing_customer)\n", + "\n", + "# -------------------------------\n", + "# Task 1: Pivot Table - Total Revenue by Sales Channel\n", + "# -------------------------------\n", + "pivot_total_revenue = pd.pivot_table(\n", + " clean_marketing_customer,\n", + " values='total_claim_amount', # assuming total_claim_amount = total revenue\n", + " index='sales_channel',\n", + " aggfunc='sum'\n", + ")\n", + "print(\"Total Revenue by Sales Channel:\")\n", + "print(pivot_total_revenue)\n", + "\n", + "# -------------------------------\n", + "# Task 2: Pivot Table - Avg. 
Customer Lifetime Value per Gender and Education\n", + "# -------------------------------\n", + "pivot_avg_clv = pd.pivot_table(\n", + " clean_marketing_customer,\n", + " values='customer_lifetime_value',\n", + " index='gender',\n", + " columns='education',\n", + " aggfunc='mean'\n", + ")\n", + "print(\"\\nAverage Customer Lifetime Value per Gender and Education:\")\n", + "print(pivot_avg_clv)" ] }, { @@ -130,7 +862,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" @@ -146,7 +878,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -160,7 +892,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.2" } }, "nbformat": 4,