Replace usage of deprecated DataFrame.append method (#495)

michaelnchin · web-flow · commit b23000271893 · 2023-06-01T18:20:31.000-07:00
* Replace deprecated Pandas append method

* update changelog

---------

Co-authored-by: Michael Chin <chnmch@amazon.com>
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -11,6 +11,7 @@ Starting with v1.31.6, this file will contain a record of major features and upd
 - Added support for setting `%graph_notebook_vis_options` from a variable ([Link to PR](https://github.com/aws/graph-notebook/pull/487))
 - Pinned JupyterLab<4.x to fix Python 3.8/3.10 builds ([Link to PR](https://github.com/aws/graph-notebook/pull/490))
 - Changed datatype of "amount" from String to numeric for "Transaction" vertices in Fraud Graph sample notebook ([Link to PR](https://github.com/aws/graph-notebook/pull/489))
+- Replaced usages of deprecated DataFrame.append method in ML samples ([Link to PR](https://github.com/aws/graph-notebook/pull/495))
 
 ## Release 3.8.1 (April 17, 2023)
 - Reinstate Python 3.7 support for compatibility with legacy AL1 Neptune Notebooks ([Link to PR](https://github.com/aws/graph-notebook/pull/479))
diff --git a/src/graph_notebook/notebooks/04-Machine-Learning/Sample-Applications/01-People-Analytics/People-Analytics-using-Neptune-ML.ipynb b/src/graph_notebook/notebooks/04-Machine-Learning/Sample-Applications/01-People-Analytics/People-Analytics-using-Neptune-ML.ipynb
@@ -332,6 +332,12 @@
     "role_dept_list = []\n",
     "role_field_list = []\n",
     "\n",
+    "edge_emp_dept_rows_list = [edge_emp_dept]\n",
+    "edge_emp_role_rows_list = [edge_emp_role]\n",
+    "edge_emp_field_rows_list = [edge_emp_field]\n",
+    "edge_role_dept_rows_list = [edge_role_dept]\n",
+    "edge_role_field_rows_list = [edge_role_field]\n",
+    "\n",
     "for index, row in df.iterrows():\n",
     "    emp = row['EmployeeNumber']\n",
     "    emp_id = emp_map[emp]\n",
@@ -341,30 +347,52 @@
     "    field_id = field_map[field]\n",
     "    dept = row['Department']\n",
     "    dept_id = dept_map[dept]\n",
-    "    \n",
-    "    edge_emp_dept = edge_emp_dept.append({'~id': uuid.uuid4(), '~from': emp_id, \n",
-    "                              '~to': dept_id, \n",
-    "                              '~label': 'works_in'}, ignore_index=True)\n",
-    "    edge_emp_role = edge_emp_role.append({'~id': uuid.uuid4(), '~from': emp_id, \n",
-    "                              '~to': role_id, \n",
-    "                              '~label': 'works_as'}, ignore_index=True)\n",
-    "    edge_emp_field = edge_emp_field.append({'~id': uuid.uuid4(), '~from': emp_id, \n",
-    "                              '~to': field_id, \n",
-    "                              '~label': 'has_education_level'}, ignore_index=True)\n",
+    "\n",
+    "    edge_emp_dept_row_df = pd.DataFrame.from_dict({'~id': uuid.uuid4(),\n",
+    "                                                   '~from': emp_id,\n",
+    "                                                   '~to': dept_id,\n",
+    "                                                   '~label': 'works_in'},\n",
+    "                                                  orient='index').T\n",
+    "    edge_emp_dept_rows_list.append(edge_emp_dept_row_df)\n",
+    "    edge_emp_role_row_df = pd.DataFrame.from_dict({'~id': uuid.uuid4(),\n",
+    "                                                   '~from': emp_id,\n",
+    "                                                   '~to': role_id,\n",
+    "                                                   '~label': 'works_as'},\n",
+    "                                                  orient='index').T\n",
+    "    edge_emp_role_rows_list.append(edge_emp_role_row_df)\n",
+    "    edge_emp_field_row_df = pd.DataFrame.from_dict({'~id': uuid.uuid4(),\n",
+    "                                                    '~from': emp_id,\n",
+    "                                                    '~to': field_id,\n",
+    "                                                    '~label': 'has_education_level'},\n",
+    "                                                   orient='index').T\n",
+    "    edge_emp_field_rows_list.append(edge_emp_field_row_df)\n",
     "    \n",
     "    role_dept = f\"{role_id}-{dept_id}\"\n",
     "    role_field = f\"{role_id}-{field_id}\"\n",
     "    if role_dept not in role_dept_list:\n",
-    "        edge_role_dept = edge_role_dept.append({'~id': uuid.uuid4(), '~from': role_id, \n",
-    "                              '~to': dept_id, \n",
-    "                              '~label': 'part_of'}, ignore_index=True)\n",
+    "        edge_role_dept_row_df = pd.DataFrame.from_dict({'~id': uuid.uuid4(),\n",
+    "                                                        '~from': role_id,\n",
+    "                                                        '~to': dept_id,\n",
+    "                                                        '~label': 'part_of'},\n",
+    "                                                       orient='index').T\n",
+    "        edge_role_dept_rows_list.append(edge_role_dept_row_df)\n",
+    "        #edge_role_dept = pd.concat([edge_role_dept, edge_role_dept_row_df], ignore_index=True)\n",
     "        role_dept_list.append(role_dept)\n",
     "    if role_field not in role_field_list:\n",
-    "        edge_role_field = edge_role_field.append({'~id': uuid.uuid4(), '~from': role_id, \n",
-    "                              '~to': field_id, \n",
-    "                              '~label': 'requires'}, ignore_index=True)\n",
+    "        edge_role_field_row_df = pd.DataFrame.from_dict({'~id': uuid.uuid4(), '~from': role_id,\n",
+    "                                                         '~to': field_id,\n",
+    "                                                         '~label': 'requires'},\n",
+    "                                                        orient='index').T\n",
+    "        edge_role_field_rows_list.append(edge_role_field_row_df)\n",
     "        role_field_list.append(role_field)\n",
     "    edge_cnt = edge_cnt + 1\n",
+    "\n",
+    "edge_emp_dept = pd.concat(edge_emp_dept_rows_list, ignore_index=True)\n",
+    "edge_emp_role = pd.concat(edge_emp_role_rows_list, ignore_index=True)\n",
+    "edge_emp_field = pd.concat(edge_emp_field_rows_list, ignore_index=True)\n",
+    "edge_role_dept = pd.concat(edge_role_dept_rows_list, ignore_index=True)\n",
+    "edge_role_field = pd.concat(edge_role_field_rows_list, ignore_index=True)\n",
+    "\n",
     "edge_df = pd.concat([edge_emp_dept, edge_emp_role, edge_emp_field, edge_role_dept,  edge_role_field])\n",
     "edge_df.to_csv(os.path.join(output_folder, 'edge.csv'), index=False)\n",
     "\n",
diff --git a/src/graph_notebook/notebooks/04-Machine-Learning/neptune_ml_utils.py b/src/graph_notebook/notebooks/04-Machine-Learning/neptune_ml_utils.py
@@ -360,18 +360,23 @@ def __process_movies_genres(self):
         genre_df['name'] = genre_df['~id']
         genre_df.to_csv(os.path.join(self.formatted_directory,
                                      'genre_vertex.csv'), index=False)
+        genres_edge_df_rows_list = [genres_edges_df]
 
         # Loop through all the movies and pull out the genres
         for index, row in movie_genre_df.iterrows():
             genre_lst = []
             for g in genres:
                 if row[g] == 1:
-                    genres_edges_df = genres_edges_df.append(
-                        {'~id': f"{row['~id']}-included_in-{g}", '~label': 'included_in',
-                         '~from': row['~id'], '~to': g}, ignore_index=True)
+                    row_as_df = pd.DataFrame.from_dict({'~id': f"{row['~id']}-included_in-{g}",
+                                                        '~label': 'included_in',
+                                                        '~from': row['~id'],
+                                                        '~to': g},
+                                                       orient='index').T
+                    genres_edge_df_rows_list.append(row_as_df)
                     genre_lst.append(g)
             movies_df.loc[index, 'genre:String[]'] = ';'.join(genre_lst)
 
+        genres_edges_df = pd.concat(genres_edge_df_rows_list, ignore_index=True)
         # rename the release data column to specify the data type
         movies_df['release_date:Date'] = movies_df['release_date']
         # Drop the genre columns as well as the uneeded release date columns