From e0bb207e82c7d397a941aa12773f1890577b0415 Mon Sep 17 00:00:00 2001 From: NodrrS Date: Sun, 3 Aug 2025 22:31:38 +0200 Subject: [PATCH 1/2] lab done --- .DS_Store | Bin 6148 -> 6148 bytes __pycache__/cleaning.cpython-312.pyc | Bin 0 -> 7336 bytes cleaning.py | 151 ++++++ lab-dw-data-structuring-and-combining.ipynb | 570 +++++++++++++++++++- 4 files changed, 712 insertions(+), 9 deletions(-) create mode 100644 __pycache__/cleaning.cpython-312.pyc create mode 100644 cleaning.py diff --git a/.DS_Store b/.DS_Store index 01222e6cfb2c441e1301af746b470bb69b3a7510..c9e9a643433af8781d0fff7d4c6ecbc4f581386c 100644 GIT binary patch delta 64 zcmZoMXfc=|#>CJ*F;Q%yo+2a5!~knXmdQL!)tgJ1C$Vg9V9sOO%+A5j0aUWNkoi0F UWPTAtMuy2YJkpzEL{=~Z06yLkSO5S3 delta 304 zcmZoMXfc=|#>B)qF;Q%yo+2a1!~pBf3XBbmKvqw6y%2*QLncE3LnT8VLlQ$gLo!1K zLn@HW28!haW%3wG7>aY!4TF>Oa|;;2K->>V5v4CT-^C>c=iHmW_JkaO=q z&JA*&9msh=&dWE}mGla}x>j`XzK`tJ{dHyQTi*|(26iA90J&W|klO`v4Lgu)0J-1} z6i(#jJ40s$WlEKkLlQTp3{4Bd6nhBMgZ%U$&#BxX z^qh(T9K^)!L0HFNOpcF<33wZdNdkw7NODFCl9#A4sVG?<+6BV~@c83U-KTU1>)+TrK$C-sD}$jV=-1Kv`t| z@Wf4_p%P zT3971#KdtiR`z>gd!@ckciDhG>f8QD{!h1rV)l#n8y1EOCvwL8b&}^ADZ9b zm$apz7GQSZHoKv^PkjSwYR|Uk&Mx)kev)Y~xccv(DSCYO-$f1Zz4q$P!F5eN@@suRZutGek&ag1y72Yya$MSN-3z6S97J|L&!EinpUJnlCgF}Vj@Otn_ zK6qq3crqV6SqPrWSTcuy!{}BJ1^Cw5wK)GI0KEwWWT{}tTG8|lg*s>9YdsS~tb%Ao z{%oJxO*3N&Yz7j2?CArQ(dV{+BMf_olTw0$Ro)lxW5JX4jrW1hjjtLV*XVIjw=K>| z0_umZG{?9=UIy@soxFgaF&2;yS8qBvuMrK54$&|)8H7g}g$lIH1m+Jdo>(}MXF9*} z`ZIU47hd|BvR8|3yNk_j#g0x~w})PN94-D0%Ha>_b_g7qA7#CI6LIoecMwJ~QRo;D z+bWbTsm#XPc?ZvY>cG3GzKUn*nrd~XJ1f;H@F{Q_qMRscHln%8EX@%IoC-*Dj&ZRG zL6TA0Ga=rdV3pW}EK4MgOAyyJ7ca+D8Q}opKySl3&=IIKmno*Cn=0h6)yAN=X>=5~ zau~wsEqDo+b^sISl1T{9Rl6WySf)HZbP^;mT$FQAVdDhn#}+3SCi4vYO-om?t*6+& zx43V(*vb}L_Pp{jjqVJi@1k~h=59P3TJIRncML!4D|DOy)St`->Vzo=^iAx8_u01g z*}a`Li0!~~pOyo471$l=wo0pzU~FVs5!3_&YYd0Pw4C5H_cW(WfNE4Zp}Aw6Btke( zh+NpJS-7-jndUSb<`Q*o0e6F;LehOH)4{;T5m>?6#7gwf_beV%!D} zOE}*WE(UiOgI&cY_7!96cW+R(RuAaySk|MvH<1mli%Z6^>oQAOeG|ahR(*R_tY%Vb zxqDN&v-H$i@DOF_n|X4TIxXpeQ7*~G<%BvR%|w$B=)_b!%B8?>a4O5OaR~VFRNMf4 zL~E+mGHfMoLbD_wJDr{okWhjrf(cMn&}@TcM{D(g-i4;`yAq&D z8Ip~eC=Oy}$Kk19YEFo53gA`JTyQ8z66c^XBp2nRur=*7zq8mVX@9wq@$N0PByZbd zxnfZC9_aW(tn5=IWjEz4kK7O4d8Vfr>dM{AhX&U}NAjT~#m>HBSGXAJ1u!4OVD7|V zK9zU2*Hv@*5cwMV-WJG`3Dy5?Zew_mCw3F?wKmD*wmFUwTveQJkTQvrjyH-qci!=Y zG1KmsJ)kK)5qRXP+)DXI*og($cqK^5adl*H*$h7{3S>om~lUX413a17KOF&Pe& zaPZR{0zxuU!i*87aXM{eN+e+8>1&oGuent@nn=N@Vr+|}>ogo)ong^6=mR?>B>@%Q zmF&`)$LE*N=a~qE^|io(yf^&P*O>j~C!-(VclgV(&! z{JU1&1tEUTvoG)L{oYH3`=9z&eb2_$j$g_jy}Wjle||u~TNrlEaZ3jN@)i>ep_Cwp%cvzJ;UJh)_J z&wDAFx<);?cGr51nx@0H^wCk-NSjLsf1JLREQgZ;II9!TK(Y-^mQBivB92J{q;=x$ z1l%Z~D=-Pv169~=K;kN9V1K9CNk*-C01IX}Xg_?Ya?in-F+Og>5vQdf>?c&*a0)7b zK%ikRolSp7Iqc0D>x*F1;^4yIl4a@eGV{#&Ja`;u##sg|n2BYN&7I1g%Kfb1*{55q zzUCJlp+}L2k(J=e=xXbiZO=Q-%v3<7zEwJ3+0*^!L`h;Vau-YSdICQ~500x8;{T6gIPr$u^Bt+vP;4X37a9s)- zhc15&kAjg>0T7%0Z2$-y?vG}V&W&V8);$OEo&$#UpL>QOdS{gEJ98Ja7jwf)eCFcc zy#3%)%Dm$%gm+kXK}U6?4Nisdv#}5EBNi#OeDN_$nRdEYu~VzK7(=8~am%OWDsDMm zRB_98ui|3wyR~0@E3}o1uMf6z;a?s#xXm@Wtu?v$JV{FB%*b((PPd+e`yQ4!elR1O z6qAA^CWJ`XN3Qe8RTaXQG>F9&I6R7+@5ng<;rxtbYb1*x5rX&#f`cvb#cjkF>`xF5 zBpG;=EL8WYf4X+(&J4fr7H)#p2`ciFfINIU=(iB97%kUt01yb?BINpGw2bQ$Vg*o_bfk z8<^DH7Wxdm)bn^?c|fNy{p`>eBcG3K5L$OyXg1gLXy76F$gUiEIvAWZ?IUJDY za6sx_9}IVC@#ex!odR-c?6I^gZD3OG@`9A*k@uljr+{1uJ`JsgHZZA&obahM7Levx zK)z~%3NruDa!99ueD+iF^M^}+y7|@34b0JTi`v3^liQqvP67E0eKGs_>;@+Fwry0# sK!; pd.DataFrame: + """Standardize column names: lowercase with underscores.""" + df.columns = ( + df.columns.str.strip() + .str.lower() + .str.replace(r'[^a-z0-9]+', '_', regex=True) + .str.replace(r'_+', '_', regex=True) + .str.strip('_') + ) + return df + +def rename_columns(df: pd.DataFrame) -> pd.DataFrame: + """Rename specific columns for consistency.""" + df = df.rename(columns={'st': 'state'}) + return df + +def remove_empty_rows_columns(df: pd.DataFrame) -> pd.DataFrame: + """Remove fully empty rows and columns.""" + return df.dropna(axis=0, how='all').dropna(axis=1, how='all') + +def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame: + """Remove duplicate rows.""" + return df.drop_duplicates() + +def strip_text_columns(df: pd.DataFrame) -> pd.DataFrame: + """Strip whitespace and convert text columns to lowercase.""" + for col in df.select_dtypes(include='object'): + df[col] = df[col].astype(str).str.strip().str.lower() + return df + +def clean_gender(df: pd.DataFrame) -> pd.DataFrame: + """Standardize gender values to 'm' or 'f'.""" + if 'gender' in df.columns: + df['gender'] = ( + df['gender'] + .replace({ + 'male': 'm', + 'm': 'm', + 'female': 'f', + 'f': 'f', + 'femal': 'f' + }) + ) + return df + +def clean_education(df: pd.DataFrame) -> pd.DataFrame: + """Standardize education values.""" + if 'education' in df.columns: + mapping = { + 'master': 'master', + 'bachelor': 'bachelor', + 'bachelors': 'bachelor', + 'high school or below': 'high school', + 'college': 'college', + 'doctor': 'doctorate' + } + df['education'] = df['education'].replace(mapping) + return df + +def clean_state_names(df: pd.DataFrame) -> pd.DataFrame: + """Standardize state names.""" + if 'state' in df.columns: + df['state'] = df['state'].replace({ + 'az': 'arizona', + 'wa': 'washington', + 'cali': 'california' + }) + return df + +def cap_monthly_premium_auto(df: pd.DataFrame, cap: float = 1000) -> pd.DataFrame: + """Cap monthly_premium_auto at a maximum value.""" + if 'monthly_premium_auto' in df.columns: + df['monthly_premium_auto'] = np.where( + df['monthly_premium_auto'] > cap, + cap, + df['monthly_premium_auto'] + ) + return df + +def clean_policy_type(df: pd.DataFrame) -> pd.DataFrame: + """Standardize policy_type values.""" + if 'policy_type' in df.columns: + df['policy_type'] = df['policy_type'].replace({ + 'personal auto': 'personal', + 'corporate auto': 'corporate', + 'special auto': 'special' + }) + return df + +def clean_customer_lifetime_value(df: pd.DataFrame) -> pd.DataFrame: + """Clean and convert customer_lifetime_value column.""" + if 'customer_lifetime_value' in df.columns: + df['customer_lifetime_value'] = ( + df['customer_lifetime_value'] + .astype(str) + .str.replace('%', '', regex=False) + .str.strip() + ) + df['customer_lifetime_value'] = pd.to_numeric(df['customer_lifetime_value'], errors='coerce') + return df + +def convert_data_types(df: pd.DataFrame) -> pd.DataFrame: + """Convert columns to numeric or datetime where possible.""" + for col in df.columns: + df[col] = pd.to_numeric(df[col], errors='ignore') + if df[col].dtype == 'object': + try: + df[col] = pd.to_datetime(df[col], errors='ignore') + except: + pass + return df + +def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame: + """Fill NaN values: numeric=0, text='unknown'.""" + for col in df.columns: + if pd.api.types.is_numeric_dtype(df[col]): + df[col].fillna(0, inplace=True) + else: + df[col].fillna('unknown', inplace=True) + return df + +def clean_data(df: pd.DataFrame) -> pd.DataFrame: + """Full cleaning pipeline.""" + df = clean_column_names(df) + df = rename_columns(df) + df = remove_empty_rows_columns(df) + df = remove_duplicates(df) + df = strip_text_columns(df) # strip + lowercase text before mapping + df = clean_gender(df) + df = clean_education(df) + df = clean_state_names(df) + df = cap_monthly_premium_auto(df) + df = clean_policy_type(df) + df = clean_customer_lifetime_value(df) + df = convert_data_types(df) + df = handle_missing_values(df) + return df diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..f6b6bec 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,14 +36,379 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "id": "6063d10e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import pandas as pd\n", + "import numpy as np\n", + "import cleaning\n", + "import cleaning as cl\n", + "importlib.reload (cleaning)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amountst
0rb50392washingtonnanmaster0.000.01000.01/0/00personalfour-door car2.704934washington
1qz44356arizonafbachelor697953.590.094.01/0/00personalfour-door car1131.464935arizona
2ai49188nevadafbachelor1288743.1748767.0108.01/0/00personaltwo-door car566.472247nevada
3ww63253californiambachelor764586.180.0106.01/0/00corporatesuv529.881344california
4ga49547washingtonmhigh school536307.6536357.068.01/0/00personalfour-door car17.269323washington
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value income \\\n", + "0 rb50392 washington nan master 0.00 0.0 \n", + "1 qz44356 arizona f bachelor 697953.59 0.0 \n", + "2 ai49188 nevada f bachelor 1288743.17 48767.0 \n", + "3 ww63253 california m bachelor 764586.18 0.0 \n", + "4 ga49547 washington m high school 536307.65 36357.0 \n", + "\n", + " monthly_premium_auto number_of_open_complaints policy_type vehicle_class \\\n", + "0 1000.0 1/0/00 personal four-door car \n", + "1 94.0 1/0/00 personal four-door car \n", + "2 108.0 1/0/00 personal two-door car \n", + "3 106.0 1/0/00 corporate suv \n", + "4 68.0 1/0/00 personal four-door car \n", + "\n", + " total_claim_amount st \n", + "0 2.704934 washington \n", + "1 1131.464935 arizona \n", + "2 566.472247 nevada \n", + "3 529.881344 california \n", + "4 17.269323 washington " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "\n", + "\n", + "url1 = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "url2 = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv'\n", + "url3 = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv'\n", + "\n", + "\n", + "customers = pd.read_csv(url1)\n", + "orders = pd.read_csv(url2)\n", + "products = pd.read_csv(url3)\n", + "\n", + "\n", + "customers_clean = cl.clean_data(customers)\n", + "orders_clean = cl.clean_data(orders)\n", + "products_clean = cl.clean_data(products)\n", + "\n", + "\n", + "customers_clean['st'] = customers_clean['state'].replace({'az':'arizona', 'wa':'washington', 'cali':'california'})\n", + "\n", + " \n", + "\n", + "customers_clean['education'] = customers_clean['education'].replace({'bachelors':'bachelor', 'high school or below':'high school', 'doctor':'doctorate'})\n", + "customers_clean['education'].unique()\n", + "\n", + "customers_clean['monthly_premium_auto'] = np.where(customers_clean['monthly_premium_auto'] > 1000, 1000, customers_clean['monthly_premium_auto'])\n", + "\n", + "customers_clean.head()" ] }, { @@ -72,14 +437,201 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original 'state' column values:\n", + "0 Arizona\n", + "1 California\n", + "2 Washington\n", + "3 Oregon\n", + "4 Oregon\n", + "Name: state, dtype: object\n", + "\n", + "Cleaned data preview:\n", + " unnamed_0 customer state customer_lifetime_value response coverage \\\n", + "0 0 dk49336 arizona 4809.216960 no basic \n", + "1 1 kx64629 california 2228.525238 no basic \n", + "2 2 lz68649 washington 14947.917300 no basic \n", + "3 3 xl78013 oregon 22332.439460 yes extended \n", + "4 4 qa50777 oregon 9025.067525 no premium \n", + "\n", + " education effective_to_date employmentstatus gender ... \\\n", + "0 college 2011-02-18 employed m ... \n", + "1 college 2011-01-18 unemployed f ... \n", + "2 bachelor 2011-02-10 employed m ... \n", + "3 college 2011-01-11 employed m ... \n", + "4 bachelor 2011-01-17 medical leave f ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 corporate corporate l3 offer3 \n", + "1 1 personal personal l3 offer4 \n", + "2 2 personal personal l3 offer3 \n", + "3 2 corporate corporate l3 offer2 \n", + "4 7 personal personal l2 offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 agent 292.800000 four-door car medsize \n", + "1 call center 744.924331 four-door car medsize \n", + "2 call center 480.000000 suv medsize \n", + "3 branch 484.013411 four-door car medsize \n", + "4 branch 707.925645 four-door car medsize \n", + "\n", + " vehicle_type month \n", + "0 a 2 \n", + "1 a 1 \n", + "2 a 2 \n", + "3 a 1 \n", + "4 a 1 \n", + "\n", + "[5 rows x 27 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n" + ] + } + ], "source": [ - "# Your code goes here" + "import pandas as pd\n", + "import cleaning as cl \n", + "\n", + "# Load the dataset\n", + "marketing_customer = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv')\n", + "\n", + "# Check first few 'state' values before cleaning\n", + "print(\"Original 'state' column values:\")\n", + "print(marketing_customer['state'].head())\n", + "\n", + "# Clean the dataset using your cleaning pipeline\n", + "clean_marketing_customer = cl.clean_data(marketing_customer)\n", + "\n", + "# Show cleaned data head\n", + "print(\"\\nCleaned data preview:\")\n", + "print(clean_marketing_customer.head())" ] }, { @@ -130,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" @@ -146,7 +698,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "base", "language": "python", "name": "python3" }, @@ -160,7 +712,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.2" } }, "nbformat": 4, From c81bdddfdf67121ea632bee97600772023fedcd9 Mon Sep 17 00:00:00 2001 From: NodrrS Date: Sun, 3 Aug 2025 22:34:27 +0200 Subject: [PATCH 2/2] ldone --- lab-dw-data-structuring-and-combining.ipynb | 184 +++++++++++++++++++- 1 file changed, 182 insertions(+), 2 deletions(-) diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index f6b6bec..33eea52 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -437,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", "metadata": { "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" @@ -634,6 +634,186 @@ "print(clean_marketing_customer.head())" ] }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fa135db5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total Revenue by Sales Channel:\n", + " total_claim_amount\n", + "sales_channel \n", + "agent 1.810227e+06\n", + "branch 1.301204e+06\n", + "call center 9.266008e+05\n", + "web 7.066000e+05\n", + "\n", + "Average Customer Lifetime Value per Gender and Education:\n", + "education bachelor college doctorate high school master\n", + "gender \n", + "f 7874.269478 7748.823325 7328.508916 8675.220201 8157.053154\n", + "m 7703.601675 8052.459288 7415.333638 8149.687783 8168.832659\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:122: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_datetime without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_datetime(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:119: FutureWarning: errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead\n", + " df[col] = pd.to_numeric(df[col], errors='ignore')\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:131: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna(0, inplace=True)\n", + "/Users/nodirbeksaidov/Desktop/labs/week 2/lab-dw-data-structuring-and-combining/cleaning.py:133: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[col].fillna('unknown', inplace=True)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import cleaning as cl\n", + "\n", + "# Load and clean dataset\n", + "marketing_customer = pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv')\n", + "clean_marketing_customer = cl.clean_data(marketing_customer)\n", + "\n", + "# -------------------------------\n", + "# Task 1: Pivot Table - Total Revenue by Sales Channel\n", + "# -------------------------------\n", + "pivot_total_revenue = pd.pivot_table(\n", + " clean_marketing_customer,\n", + " values='total_claim_amount', # assuming total_claim_amount = total revenue\n", + " index='sales_channel',\n", + " aggfunc='sum'\n", + ")\n", + "print(\"Total Revenue by Sales Channel:\")\n", + "print(pivot_total_revenue)\n", + "\n", + "# -------------------------------\n", + "# Task 2: Pivot Table - Avg. Customer Lifetime Value per Gender and Education\n", + "# -------------------------------\n", + "pivot_avg_clv = pd.pivot_table(\n", + " clean_marketing_customer,\n", + " values='customer_lifetime_value',\n", + " index='gender',\n", + " columns='education',\n", + " aggfunc='mean'\n", + ")\n", + "print(\"\\nAverage Customer Lifetime Value per Gender and Education:\")\n", + "print(pivot_avg_clv)" + ] + }, { "cell_type": "markdown", "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", @@ -682,7 +862,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4"