From 4db5e609c684cc4b8477683064d7d3b3e1afbf65 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 2 May 2023 15:22:08 -0700 Subject: [PATCH 01/15] Add files via upload --- ...lar-Structures-as-Graph-Data-Gremlin.ipynb | 623 ++++++++++++++++++ .../04-Life-Science-Applications/__init__.py | 0 2 files changed, 623 insertions(+) create mode 100644 src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb create mode 100644 src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/__init__.py diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb new file mode 100644 index 00000000..f6081925 --- /dev/null +++ b/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01135e5b", + "metadata": {}, + "source": [ + "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. BSD 3-Clause \"New\" or \"Revised\" License" + ] + }, + { + "cell_type": "markdown", + "id": "e147f141", + "metadata": {}, + "source": [ + "# Model molecular SMILES data with Amazon Neptune and RDKit" + ] + }, + { + "cell_type": "markdown", + "id": "3b87be86", + "metadata": {}, + "source": [ + "This notebook walks through the process of modeling chemical structures using Amazon Neptune. 
It includes transforming a chemical compound represented as a SMILES string into graph data, with nodes representing individual atoms and edges representing the bonds between atoms. This notebook then closes with a visualization and exploration of the molecule caffeine." + ] + }, + { + "cell_type": "markdown", + "id": "9716ef94", + "metadata": {}, + "source": [ + " - [Background](#Background)\n", + "\n", + " - [Solution Overview](#Solution-Overview)\n", + "\n", + " - [Package Setup](#Package-Setup)\n", + "\n", + " - [Graph Data Model](#Graph-Data-Model)\n", + "\n", + " - [RDKit Processing](#RDKit-Processing)\n", + "\n", + " - [Amazon Neptune Data Upload](#Amazon-Neptune-Data-Upload)\n", + "\n", + " - [Basic Visualization & Queries](#Basic-Visualization-&-Queries)\n", + "\n", + " - [Clean Up](#Clean-Up)" + ] + }, + { + "cell_type": "markdown", + "id": "23409061", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "id": "63016f6c", + "metadata": {}, + "source": [ + " Modeling chemical structures can be a complex and tedious process, even with the help of modern programs and technology. The ability to explore chemical structures at the most fundamental level of atoms and the bonds that connect them is an essential process in [drug discovery, pharmaceutical research](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00947), and [chemical engineering](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8130509/). By infusing chemical research with technology, researchers can expedite outcome timelines, identify hidden relationships, and overall simplify a traditionally complex process. 
" + ] + }, + { + "cell_type": "markdown", + "id": "a9bfdfa6", + "metadata": {}, + "source": [ + "## Solution Overview" + ] + }, + { + "cell_type": "markdown", + "id": "33e4d4ac", + "metadata": {}, + "source": [ + "In order to integrate technology into the analysis of chemical structures, molecules themselves must first be represented in a machine-readable format, such as [SMILES (simplified molecular-input line-entry system)](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system). **SMILES** format strings are the industry standard in representing molecular structures. The SMILES format enables the relationships between atoms in a molecular structure to be conveyed as a machine processable string. The SMILES format is not all encompassing, leaving out details such as certain polarities and bond properties. However, SMILES does enable powerful analysis at scale of different structures. \n", + "\n", + "Using Amazon Neptune and the open-source cheminformatics software package [RDKit](https://www.rdkit.org/), SMILES format data can be ingested, processed, and converted into nodes and edges in a property graph. Modeling molecular structures in a graph database allows for powerful custom visualization and manipulation at the scale demanded by pharmaceutical applications. Utilizing a graph database such as Neptune allows users to compare millions of molecules with millions of associated interactions. Additionally, the fully managed and serverless infrastructure allows experts with backgrounds in biology and chemistry to focus primarily on the research outcomes of their graph data, avoiding the undifferentiated heavy lifting of managing a complex graph database infrastructure.\n", + "\n", + "This walkthrough follows the process of converting a singular SMILES string, **caffeine**, to graph data in Neptune. However, the process will work for any SMILES format string you would like to use. 
We’re sourcing the string for caffeine from the [National Library of Medicine](https://pubchem.ncbi.nlm.nih.gov/compound/2519#section=InChI), which maintains a public dataset of many chemical structures [**CN1C=NC2=C1C(=O)N(C(=O)N2C)C**].\n" + ] + }, + { + "cell_type": "markdown", + "id": "79e81e96", + "metadata": {}, + "source": [ + "We also use the open-source cheminformatics package [RDKit](https://github.com/rdkit/rdkit), Python-based data science package [Pandas](https://pandas.pydata.org/), and [AWS SDK for pandas (awswrangler)](https://aws-sdk-pandas.readthedocs.io/en/stable/). RDKit has a strong community and a great number of chem-informatics utilities; we’ll only be exploring a small portion for this post. Pandas is an open-source Python-based data science toolkit with large community support. The AWS SDK for pandas provides a large set of tools to help AWS services interact with the pandas library." + ] + }, + { + "cell_type": "markdown", + "id": "c49cf299", + "metadata": {}, + "source": [ + "## Package Setup" + ] + }, + { + "cell_type": "markdown", + "id": "4e1c8a9d", + "metadata": {}, + "source": [ + "The first step in modeling a chemical structure as graph data is importing the required packages. 
Here we will be using **RDKit**, **Pandas**, and **awswrangler**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb5f3b98", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install rdkit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "440c97e6", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install awswrangler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a5625f3", + "metadata": {}, + "outputs": [], + "source": [ + "from rdkit import Chem\n", + "import pandas as pd\n", + "import awswrangler as wr" + ] + }, + { + "cell_type": "markdown", + "id": "24e2d68d", + "metadata": {}, + "source": [ + "## Graph Data Model" + ] + }, + { + "cell_type": "markdown", + "id": "536a3491", + "metadata": {}, + "source": [ + "There are a few different options for graph query languages and their associated data models when working with Neptune; in this case we’re using **Apache TinkerPop’s Gremlin**. We are opting for Gremlin here due to its intuitive nature and easy to learn syntax. The cell below is defining a dictionary object for both the nodes and edges of our graph. Within each dictionary object is a set of properties we will gather from our caffeine molecule in the next section." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a8cd157", + "metadata": {}, + "outputs": [], + "source": [ + "nodes_dict = {'~id':[],\n", + " '~label':[],\n", + " 'idx':[],\n", + " 'atomicNumber':[],\n", + " 'isAromatic': []\n", + " }\n", + " \n", + "edges_dict = {'~id':[],\n", + " '~label':[],\n", + " '~from':[],\n", + " '~to':[],\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "d049b9e6", + "metadata": {}, + "source": [ + "## RDKit Processing" + ] + }, + { + "cell_type": "markdown", + "id": "21846f4c", + "metadata": {}, + "source": [ + "This section is where the chemical computing magic happens. 
We use the `rdkit` package installed earlier in the graph notebook to decompose our chemical structure into lists of **nodes (atoms)** and **edges (bonds)**." + ] + }, + { + "cell_type": "markdown", + "id": "c69d0e0d", + "metadata": {}, + "source": [ + "First, we want to declare our SMILES string for the caffeine molecule as a variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33a9fdf0", + "metadata": {}, + "outputs": [], + "source": [ + "caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'" + ] + }, + { + "cell_type": "markdown", + "id": "cacd427b", + "metadata": {}, + "source": [ + "Next, to obtain a molecule-type object from the RDKit package, we need to use the below call to the `Chem` library from RDKit imported earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "208de8b7", + "metadata": {}, + "outputs": [], + "source": [ + "mol = Chem.MolFromSmiles(caffeine_smiles)" + ] + }, + { + "cell_type": "markdown", + "id": "45119037", + "metadata": {}, + "source": [ + "Let’s see a visual of our work so far, run the following cell to output a 2D picture of our molecule below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7260f5c8", + "metadata": {}, + "outputs": [], + "source": [ + "mol" + ] + }, + { + "cell_type": "markdown", + "id": "b1932568", + "metadata": {}, + "source": [ + "To recap what we just did, first we declared our SMILES string for caffeine as the variable `caffeine_smiles`. Next, we used the `Chem.MolFromSmiles` function from RDKit to turn the SMILES into a `mol` type object defined by RDKit. Finally, we returned the `mol` type object which resulted in a 2D image of the molecular structure for caffeine that we were working with." + ] + }, + { + "cell_type": "markdown", + "id": "0cbd5ec2", + "metadata": {}, + "source": [ + "Now we need to iterate through each atom and bond within the `mol` object outputted from RDKit. 
While iterating through each atom and bond, we use the graph data model we declared earlier, storing properties of each inside the data model. Feel free to dive deeper into the `mol.GetAtoms()` and `mol.GetBonds()` function calls on your own - we are only exploring a small subset of their functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b030e4", + "metadata": {}, + "outputs": [], + "source": [ + "for atom in mol.GetAtoms():\n", + " nodes_dict['~id'].append('Node-'+ caffeine_smiles + str(atom.GetIdx()))\n", + " nodes_dict['~label'].append(atom.GetSymbol())\n", + " nodes_dict['idx'].append(atom.GetIdx())\n", + " nodes_dict['atomicNumber'].append(atom.GetAtomicNum())\n", + " nodes_dict['isAromatic'].append(atom.GetIsAromatic())\n", + "\n", + "for bond in mol.GetBonds():\n", + " edges_dict['~id'].append('edge-'+ caffeine_smiles + str(bond.GetBeginAtomIdx()) + str(bond.GetEndAtomIdx()))\n", + " edges_dict['~label'].append(str(bond.GetBondType()))\n", + " edges_dict['~from'].append('Node-' + caffeine_smiles + str(bond.GetBeginAtomIdx()))\n", + " edges_dict['~to'].append('Node-' + caffeine_smiles + str(bond.GetEndAtomIdx()))" + ] + }, + { + "cell_type": "markdown", + "id": "09038090", + "metadata": {}, + "source": [ + "Several different RDKit functions are in this portion of code, so let’s break it down piece by piece: " + ] + }, + { + "cell_type": "markdown", + "id": "caeeaeb3", + "metadata": {}, + "source": [ + "•\tFor the `~id` field of both nodes and edges, we combine the data type `Node` or `Edge`, the SMILES string, and the unique index for the atom" + ] + }, + { + "cell_type": "markdown", + "id": "2e8287cf", + "metadata": {}, + "source": [ + "•\tFor the `~label` field, we use the chemical symbol for nodes, and the bond type for the edges" + ] + }, + { + "cell_type": "markdown", + "id": "19bf0671", + "metadata": {}, + "source": [ + "•\tThe fields `~from` and `~to` for the edges (bonds) are constructed by combining the prefix 
`Node-` with the SMILES string, and the respective beginning and ending atoms that the bond connects" + ] + }, + { + "cell_type": "markdown", + "id": "0da3d8f8", + "metadata": {}, + "source": [ + "•\tThe additional fields for the nodes (atoms) in the graph model are the atom’s unique ID within the molecule, its atomic number, and if it is aromatic or not" + ] + }, + { + "cell_type": "markdown", + "id": "3ccc7a32", + "metadata": {}, + "source": [ + "Note that you can extract several atomic properties for a given SMILES string in RDKit and add them as additional fields for a given atom or bond. We don’t list them all in this post, but you can explore additional fields for both the atoms and bonds." + ] + }, + { + "cell_type": "markdown", + "id": "bcbf50b9", + "metadata": {}, + "source": [ + "Finally, we want to use **Pandas** to transform our data-populated dictionaries into pandas data frames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab3a8fa7", + "metadata": {}, + "outputs": [], + "source": [ + "nodes_df = pd.DataFrame.from_dict(nodes_dict)\n", + "edges_df = pd.DataFrame.from_dict(edges_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "35572287", + "metadata": {}, + "source": [ + "Let's check the results of our work so far. Running the cells below should return data frames for both the edges and nodes of our caffeine molecule." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f8df5de", + "metadata": {}, + "outputs": [], + "source": [ + "edges_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1096a312", + "metadata": {}, + "outputs": [], + "source": [ + "nodes_df" + ] + }, + { + "cell_type": "markdown", + "id": "da724549", + "metadata": {}, + "source": [ + "## Amazon Neptune Data Upload" + ] + }, + { + "cell_type": "markdown", + "id": "5ead358b", + "metadata": {}, + "source": [ + "Now that we have successfully decomposed our caffeine SMILES string into individual atoms and bonds, the next step is to load our data into the Neptune database itself. This will be much simpler than loading data from an external source because our data is already inside the graph notebook environment. In order to write our data to the Neptune database, we will be using the **AWS SDK for pandas**, also known as **awswrangler**" + ] + }, + { + "cell_type": "markdown", + "id": "271ac986", + "metadata": {}, + "source": [ + "First, we need to check our notebook configuration to gather the host endpoint for our cluster. Running the cell below will provide that information, along with other important details about our Neptune database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61c779af", + "metadata": {}, + "outputs": [], + "source": [ + "%graph_notebook_config" + ] + }, + { + "cell_type": "markdown", + "id": "66745fed", + "metadata": {}, + "source": [ + "Find the `host` field from the above output and copy & paste the string into the cell below where it says `'[INSERT YOUR HOST HERE]'`. Also, check the port number above, ensure that the port number above is the same as the second parameter in the cell below. The default port number should be `8182`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "7343a1e7", + "metadata": {}, + "source": [ + "Run the cell below once the host & port are properly copied from your graph configuration output above. This cell is simply using a command from the Neptune section of the awswrangler library to establish a connection to your Neptune instance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b5da429", + "metadata": {}, + "outputs": [], + "source": [ + "client = wr.neptune.connect(\"INSERT YOUR HOST HERE\", 8182, iam_enabled=False)" + ] + }, + { + "cell_type": "markdown", + "id": "d26e71fd", + "metadata": {}, + "source": [ + "The next two cells use the `.to_property_graph` function within awswrangler to insert both the node & edge data frames we created earlier into our Neptune database. Both cells should return a `\"True\"` upon success." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60f6dcf4", + "metadata": {}, + "outputs": [], + "source": [ + "wr.neptune.to_property_graph(client, df=nodes_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e06f3138", + "metadata": {}, + "outputs": [], + "source": [ + "wr.neptune.to_property_graph(client, df=edges_df)" + ] + }, + { + "cell_type": "markdown", + "id": "8064ef96", + "metadata": {}, + "source": [ + "After receiving a `\"True\"` output from both cells, you are finished with processing your SMILES molecule string. Now you can move on to visualizing your molecule as graph data. If you wish to add additional compounds to your graph database, you can return to the start of the [Graph Data Model](#Graph-Data-Model) section to re-initialize the node and edge dictionaries, and then redo the [RDKit Processing](#RDKit-Processing) steps with a different SMILES string." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3da70417", + "metadata": {}, + "source": [ + "## Basic Visualization & Queries" + ] + }, + { + "cell_type": "markdown", + "id": "cb19689b", + "metadata": {}, + "source": [ + "Now that we have processed and loaded our molecular data, let's visualize the results of our efforts so far. The below cell uses a Gremlin query to traverse outwards from nodes which are labeled as **'C'** (*for the element Carbon*), giving us a picture of the overall structure of our molecule." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dbb8414", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%gremlin -p v,ine,outv,oute,inv,oute,inv,oute,inv,oute,inv\n", + "g.V().has('~label','C').repeat(outE().inV()).emit().times(5).path().by(valueMap(true))" + ] + }, + { + "cell_type": "markdown", + "id": "553a58d8", + "metadata": {}, + "source": [ + "You should receive an interactive view of the 2D image RDKit produced earlier as the output of the above cell. Feel free to explore the structure and compare it to other visualizations you might be able to find of your molecule. Also, now that our molecule is persisted as graph data, you can manipulate, edit, and add additional data to your molecular structure as you see fit." + ] + }, + { + "cell_type": "markdown", + "id": "2bc4961d", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "a7dd3d35", + "metadata": {}, + "source": [ + "In this guide, you ingested and parsed a **SMILES** format molecular data string with **RDKit** and\n", + "uploaded the individual atoms and bonds as graph data to **Amazon Neptune**. You can replicate this\n", + "process at *scale* to accommodate large datasets containing many SMILES strings. You can test\n", + "this yourself by following the steps for any SMILES string of your choice. 
With the molecular\n", + "data broken into individual atoms and bonds in Neptune, you can connect this data to **custom\n", + "bioinformatics applications**, **chemical computing systems**, and **research software environments**.\n", + "You can take this solution even further by integrating [Amazon Neptune ML](https://aws.amazon.com/blogs/database/how-to-get-started-with-neptune-ml/) to gain the ability to\n", + "predict the connections and properties of your molecules." + ] + }, + { + "cell_type": "markdown", + "id": "b360fb55", + "metadata": {}, + "source": [ + "**See these other resources below to learn more about Amazon Neptune's role in Healthcare & Life Sciences:**" + ] + }, + { + "cell_type": "markdown", + "id": "fefa124d", + "metadata": {}, + "source": [ + "- [Accelerating drug discovery through knowledge graphs](https://aws.amazon.com/blogs/industries/accelerating-drug-discovery-through-knowledge-graph/)\n", + "\n", + "- [Analyze healthcare FHIR data with Amazon Neptune](https://aws.amazon.com/blogs/database/analyze-healthcare-fhir-data-with-amazon-neptune/)\n", + "\n", + "- [Building Amazon Neptune based MedDRA terminology mapping for pharmacovigilance and adverse event reporting](https://aws.amazon.com/blogs/industries/building-amazon-neptune-based-meddra-terminology-mapping-for-pharmacovigilance-and-adverse-event-reporting/)\n", + "\n", + "- [Building and querying the AWS COVID-19 knowledge graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)" + ] + }, + { + "cell_type": "markdown", + "id": "6f7c99a3", + "metadata": {}, + "source": [ + "## Clean Up" + ] + }, + { + "cell_type": "markdown", + "id": "9b35cd1c", + "metadata": {}, + "source": [ + "*WILL DELETE MOLECULE DATA*" + ] + }, + { + "cell_type": "markdown", + "id": "9d267a23", + "metadata": {}, + "source": [ + "Run the cell below to delete the data in the graph if you want to clean your graph database storage. 
This iterates through the `nodes_df`, so be sure to adjust accordingly if you have added any of your own additional molecules or edits." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "363ea618", + "metadata": {}, + "outputs": [], + "source": [ + "for i in nodes_df['~id']:\n", + " wr.neptune.execute_gremlin(client, \"g.V().has('~id', '\"+i+\"').drop();\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c13d347", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/__init__.py b/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/__init__.py new file mode 100644 index 00000000..e69de29b From 20f0540a066891fa9bce396a38219989bbf90007 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 2 May 2023 15:24:07 -0700 Subject: [PATCH 02/15] Delete src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications directory --- ...lar-Structures-as-Graph-Data-Gremlin.ipynb | 623 ------------------ .../04-Life-Science-Applications/__init__.py | 0 2 files changed, 623 deletions(-) delete mode 100644 src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb delete mode 100644 src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/__init__.py diff --git 
a/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb deleted file mode 100644 index f6081925..00000000 --- a/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb +++ /dev/null @@ -1,623 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "01135e5b", - "metadata": {}, - "source": [ - "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. BSD 3-Clause \"New\" or \"Revised\" License" - ] - }, - { - "cell_type": "markdown", - "id": "e147f141", - "metadata": {}, - "source": [ - "# Model molecular SMILES data with Amazon Neptune and RDKit" - ] - }, - { - "cell_type": "markdown", - "id": "3b87be86", - "metadata": {}, - "source": [ - "This notebook walks through the process of modeling chemical structures using Amazon Neptune. It includes transforming a chemical compound represented as a SMILES string into graph data, with nodes representing individual atoms and edges representing the bonds between atoms.This notebook then closes with a visualization and exploration of the molecule caffeine." 
- ] - }, - { - "cell_type": "markdown", - "id": "9716ef94", - "metadata": {}, - "source": [ - " - [Background](#Background)\n", - "\n", - " - [Solution Overview](#Solution-Overview)\n", - "\n", - " - [Package Setup](#Package-Setup)\n", - "\n", - " - [Graph Data Model](#Graph-Data-Model)\n", - "\n", - " - [RDKit Processing](#RDKit-Processing)\n", - "\n", - " - [Amazon Neptune Data Upload](#Data-Upload)\n", - "\n", - " - [Basic Visualization & Queries](#Basic-Visualization-&-Queries)\n", - "\n", - " - [Clean Up](#Clean-Up)" - ] - }, - { - "cell_type": "markdown", - "id": "23409061", - "metadata": {}, - "source": [ - "## Background" - ] - }, - { - "cell_type": "markdown", - "id": "63016f6c", - "metadata": {}, - "source": [ - " Modeling chemical structures can be a complex and tedious process, even with the help of modern programs and technology. The ability to explore chemical structures at the most fundamental level of atoms and the bonds that connect them is an essential process in [drug discovery, pharmaceutical research](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00947), and [chemical engineering](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8130509/). By infusing chemical research with technology, researchers can expedite outcome timelines, identify hidden relationships, and overall simplify a traditionally complex process. " - ] - }, - { - "cell_type": "markdown", - "id": "a9bfdfa6", - "metadata": {}, - "source": [ - "## Solution Overview" - ] - }, - { - "cell_type": "markdown", - "id": "33e4d4ac", - "metadata": {}, - "source": [ - "In order to integrate technology into the analysis of chemical structures, molecules themselves must first be represented in a machine-readable format, such as [SMILES (simplified molecular-input line-entry system)](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system). **SMILES** format strings are the industry standard in representing molecular structures. 
The SMILES format enables the relationships between atoms in a molecular structure to be conveyed as a machine processable string. The SMILES format is not all encompassing, leaving out details such as certain polarities and bond properties. However, SMILES does enable powerful analysis at scale of different structures. \n", - "\n", - "Using Amazon Neptune and the open-source chemical-Informaics software package [RDKit](https://www.rdkit.org/), SMILES format data can be ingested, processed, and converted into nodes and edges in a property graph. Modeling molecular structures in a graph database allows for powerful custom visualization and manipulation at the scale demanded by pharmaceutical applications. Utilizing a graph database such as Neptune allows users to compare millions of molecules with millions of associated interactions. Additionally, the fully managed and serverless infrastructure allows experts with backgrounds in biology and chemistry to focus primarily on the research outcomes of their graph data, avoiding the undifferentiated heavy lifting of managing a complex graph database infrastructure.\n", - "\n", - "This walkthrough follows the process of converting a singular SMILES string, **caffeine**, to graph data in Neptune. However, the process will work for any SMILES format string you would like to use. We’re sourcing the string for caffeine from the [National Library of Medicine](https://pubchem.ncbi.nlm.nih.gov/compound/2519#section=InChI), which maintains a public dataset of many chemical structures [**CN1C=NC2=C1C(=O)N(C(=O)N2C)C**].\n" - ] - }, - { - "cell_type": "markdown", - "id": "79e81e96", - "metadata": {}, - "source": [ - "We also use the open-source cheminformatics package [RDKit](https://github.com/rdkit/rdkit), Python-based data science package [Pandas](https://pandas.pydata.org/), and [AWS SDK for pandas (awswrangler)](https://aws-sdk-pandas.readthedocs.io/en/stable/). 
RDKit has a strong community and a great number of chem-informatics utilities; we’ll only be exploring a small portion for this post. Pandas is an open-source Python-based data science toolkit with large community support. The AWS SDK for pandas provides a large set of tools to help AWS services interact with the pandas library." - ] - }, - { - "cell_type": "markdown", - "id": "c49cf299", - "metadata": {}, - "source": [ - "## Package Setup" - ] - }, - { - "cell_type": "markdown", - "id": "4e1c8a9d", - "metadata": {}, - "source": [ - "The first step in modeling a chemical structure as graph data is importing the required packages. Here we will be using **RDKit**, **Pandas**, and **awswrangler**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cb5f3b98", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install rdkit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "440c97e6", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install awswrangler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9a5625f3", - "metadata": {}, - "outputs": [], - "source": [ - "from rdkit import Chem\n", - "import pandas as pd\n", - "import awswrangler as wr" - ] - }, - { - "cell_type": "markdown", - "id": "24e2d68d", - "metadata": {}, - "source": [ - "## Graph Data Model" - ] - }, - { - "cell_type": "markdown", - "id": "536a3491", - "metadata": {}, - "source": [ - "There are a few different options for graph query languages and their associated data models when working with Neptune; in this case we’re using **Apache TinkerPop’s Gremlin**. We are opting for Gremlin here due to its intuitive nature and easy to learn syntax. The cell below is defining a dictionary object for both the nodes and edges of our graph. Within each dictionary object is a set of properties we will gather from our caffeine molecule in the next section." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a8cd157", - "metadata": {}, - "outputs": [], - "source": [ - "nodes_dict = {'~id':[],\n", - " '~label':[],\n", - " 'idx':[],\n", - " 'atomicNumber':[],\n", - " 'isAromatic': []\n", - " }\n", - " \n", - "edges_dict = {'~id':[],\n", - " '~label':[],\n", - " '~from':[],\n", - " '~to':[],\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "d049b9e6", - "metadata": {}, - "source": [ - "## RDKit Processing" - ] - }, - { - "cell_type": "markdown", - "id": "21846f4c", - "metadata": {}, - "source": [ - "This section is where the chemical computing magic happens. We use the `rdkit` package installed earlier in the graph notebook to decompose our chemical structure into lists of **nodes (atoms)** and **edges (bonds)**." - ] - }, - { - "cell_type": "markdown", - "id": "c69d0e0d", - "metadata": {}, - "source": [ - "First, we want to declare our SMILES string for the caffeine molecule as a variable." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "33a9fdf0", - "metadata": {}, - "outputs": [], - "source": [ - "caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'" - ] - }, - { - "cell_type": "markdown", - "id": "cacd427b", - "metadata": {}, - "source": [ - "Next, to obtain a molecule-type object from the RDKit package, we need to use the below call to the `Chem` library from RDKit imported earlier." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "208de8b7", - "metadata": {}, - "outputs": [], - "source": [ - "mol = Chem.MolFromSmiles(caffeine_smiles)" - ] - }, - { - "cell_type": "markdown", - "id": "45119037", - "metadata": {}, - "source": [ - "Let’s see a visual of our work so far, run the following cell to output a 2D picture of our molecule below." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7260f5c8", - "metadata": {}, - "outputs": [], - "source": [ - "mol" - ] - }, - { - "cell_type": "markdown", - "id": "b1932568", - "metadata": {}, - "source": [ - "To recap what we just did, first we declared our SMILES string for caffeine as the variable `caffeine_smiles`. Next, we used the `Chem.MolFromSmiles` function from RDKit to turn the SMILES into a `mol` type object defined by RDKit. Finally, we returned the `mol` type object which resulted in a 2D image of the molecular structure for caffeine that we were working with." - ] - }, - { - "cell_type": "markdown", - "id": "0cbd5ec2", - "metadata": {}, - "source": [ - "Now we need to iterate through each atom and bond within the `mol` object outputted from RDKit. While iterating through each atom and bond, we use the graph data model we declared earlier, storing properties of each inside the data model. Feel free to dive deeper into the `mol.GetAtoms()` and `mol.GetBonds()` function calls on your own - we are only exploring a small subset of their functionality." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d3b030e4", - "metadata": {}, - "outputs": [], - "source": [ - "for atom in mol.GetAtoms():\n", - " nodes_dict['~id'].append('Node-'+ caffeine_smiles + str(atom.GetIdx()))\n", - " nodes_dict['~label'].append(atom.GetSymbol())\n", - " nodes_dict['idx'].append(atom.GetIdx())\n", - " nodes_dict['atomicNumber'].append(atom.GetAtomicNum())\n", - " nodes_dict['isAromatic'].append(atom.GetIsAromatic())\n", - "\n", - "for bond in mol.GetBonds():\n", - " edges_dict['~id'].append('edge-'+ caffeine_smiles + str(bond.GetBeginAtomIdx()) + str(bond.GetEndAtomIdx()))\n", - " edges_dict['~label'].append(str(bond.GetBondType()))\n", - " edges_dict['~from'].append('Node-' + caffeine_smiles + str(bond.GetBeginAtomIdx()))\n", - " edges_dict['~to'].append('Node-' + caffeine_smiles + str(bond.GetEndAtomIdx()))" - ] - }, - { - "cell_type": "markdown", - "id": "09038090", - "metadata": {}, - "source": [ - "Several different RDKit functions are in this portion of code, so let’s break it down piece by piece: " - ] - }, - { - "cell_type": "markdown", - "id": "caeeaeb3", - "metadata": {}, - "source": [ - "•\tFor the `~id` field of both nodes and edges, we combine the data type `Node` or `Edge`, the SMILES string, and the unique index for the atom" - ] - }, - { - "cell_type": "markdown", - "id": "2e8287cf", - "metadata": {}, - "source": [ - "•\tFor the `~label` field, we use the chemical symbol for nodes, and the bond type for the edges" - ] - }, - { - "cell_type": "markdown", - "id": "19bf0671", - "metadata": {}, - "source": [ - "•\tThe fields `~from` and `~to` for the edges (bonds) are constructed by combining the prefix `Node-` with the SMILES string, and the respective beginning and ending atoms that the bond connects" - ] - }, - { - "cell_type": "markdown", - "id": "0da3d8f8", - "metadata": {}, - "source": [ - "•\tThe additional fields for the nodes (atoms) in the graph model are the atom’s unique ID within the 
molecule, its atomic number, and if it is aromatic or not" - ] - }, - { - "cell_type": "markdown", - "id": "3ccc7a32", - "metadata": {}, - "source": [ - "Note that you can extract several atomic properties for a given SMILES string in RDKit and add them as additional fields for a given atom or bond. We don’t list them all in this post, but you can explore additional fields for both the atoms and bonds." - ] - }, - { - "cell_type": "markdown", - "id": "bcbf50b9", - "metadata": {}, - "source": [ - "Finally, we want to use **Pandas** to transform our data-poulated dictionaries into pandas data frames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab3a8fa7", - "metadata": {}, - "outputs": [], - "source": [ - "nodes_df = pd.DataFrame.from_dict(nodes_dict)\n", - "edges_df = pd.DataFrame.from_dict(edges_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "35572287", - "metadata": {}, - "source": [ - "let's check the results of our work so far. Running the cells below should return data frames for both the edges and nodes of our caffeine molecule." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f8df5de", - "metadata": {}, - "outputs": [], - "source": [ - "edges_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1096a312", - "metadata": {}, - "outputs": [], - "source": [ - "nodes_df" - ] - }, - { - "cell_type": "markdown", - "id": "da724549", - "metadata": {}, - "source": [ - "## Amazon Neptune Data Upload" - ] - }, - { - "cell_type": "markdown", - "id": "5ead358b", - "metadata": {}, - "source": [ - "Now that we have successfully decomposed our caffeine SMILES string into individual atoms and bonds, the next step is to load our data into the Neptune database itself. This will be much simpler than loading data from an external source because our data is already inside the graph notebook environment. 
In order to write our data to the Neptune database, we will be using the **AWS SDK for pandas**, also known as **awswrangler**" - ] - }, - { - "cell_type": "markdown", - "id": "271ac986", - "metadata": {}, - "source": [ - "First, we need to check our notebook configuration to gather the host endpoint for our cluster. Running the cell below will provide that information, along with other important details about our Neptune database." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61c779af", - "metadata": {}, - "outputs": [], - "source": [ - "%graph_notebook_config" - ] - }, - { - "cell_type": "markdown", - "id": "66745fed", - "metadata": {}, - "source": [ - "Find the `host` field from the above output and copy & paste the string into the cell below where it says `'[INSERT YOUR HOST HERE]'`. Also, check the port number above, ensure that the port number above is the same as the second parameter in the cell below. The default port number should be `8182`." - ] - }, - { - "cell_type": "markdown", - "id": "7343a1e7", - "metadata": {}, - "source": [ - "Run cell below once the host & port is properly copied from your graph configuration output above. This cell is simply using a command from the Neptune section of the awswrangler library to establish a connection to your Neptune instance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b5da429", - "metadata": {}, - "outputs": [], - "source": [ - "client = wr.neptune.connect(\"INSERT YOUR HOST HERE\", 8182, iam_enabled=False)" - ] - }, - { - "cell_type": "markdown", - "id": "d26e71fd", - "metadata": {}, - "source": [ - "The next two cells use the `.to_proprty_graph` functions within awswrangler to insert both the node & edge data frames we created earlier into our Neptune database. Both cells should return a `\"True\"` upon success." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60f6dcf4", - "metadata": {}, - "outputs": [], - "source": [ - "wr.neptune.to_property_graph(client, df=nodes_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e06f3138", - "metadata": {}, - "outputs": [], - "source": [ - "wr.neptune.to_property_graph(client, df=edges_df)" - ] - }, - { - "cell_type": "markdown", - "id": "8064ef96", - "metadata": {}, - "source": [ - "After recieving a `\"True\"` output from both cells, you are finished with processing your SMILES molecule string. Now you can move onto visualizing your molecule as graph data. If you wish to add additional compounds to your graph database, you can return to the start of the [RDKit Processing](#RDKit-Processing) section and simply redo the process with a different SMILES string." - ] - }, - { - "cell_type": "markdown", - "id": "3da70417", - "metadata": {}, - "source": [ - "## Basic Visualization & Queries" - ] - }, - { - "cell_type": "markdown", - "id": "cb19689b", - "metadata": {}, - "source": [ - "Now that we have processed and loaded our molecular data, let's visualize the results of our efforts so far. The below cell uses a Gremlin query to traverse outwards from nodes which are labeled as **'C'** (*for the element Carbon*), giving us a picture of the overall structure of our molecule." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5dbb8414", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%%gremlin -p v,ine,outv,oute,inv,oute,inv,oute,inv,oute,inv\n", - "g.V().has('~label','C').repeat(outE().inV()).emit().times(5).path().by(valueMap(true))" - ] - }, - { - "cell_type": "markdown", - "id": "553a58d8", - "metadata": {}, - "source": [ - "You should receive an interactive view of the 2D image RDKit produced earlier as the output of the above cell. 
Feel free to explore the structure and compare it to other visualizations you might be able to find of your molecule. Also, now that our molecule is persisted as graph data, you can manipulate, edit, and add additional data to your molecular structure as you see fit." - ] - }, - { - "cell_type": "markdown", - "id": "2bc4961d", - "metadata": {}, - "source": [ - "## Conclusion" - ] - }, - { - "cell_type": "markdown", - "id": "a7dd3d35", - "metadata": {}, - "source": [ - "In this guide, you ingested and parsed a **SMILES** format molecular data string with **RDKit** and\n", - "uploaded the individual atoms and bonds as graph data to **Amazon Neptune**. You can replicate this\n", - "process at *scale* to accommodate large datasets containing many SMILES strings. You can test\n", - "this yourself by following the steps for any SMILES string of your choice. With the molecular\n", - "data broken into individual atoms and bonds in Neptune, you can connect this data to **custom\n", - "bioinformatics applications**, **chemical computing systems**, and **research software environments**.\n", - "You can take this solution even further by integrating [Amazon Neptune ML](https://aws.amazon.com/blogs/database/how-to-get-started-with-neptune-ml/) to gain the ability to\n", - "predict the connections and properties of your molecules." 
- ] - }, - { - "cell_type": "markdown", - "id": "b360fb55", - "metadata": {}, - "source": [ - "**See these other resources below to learn more about Amazon Neptune's role in Healthcare & Life Sciences:**" - ] - }, - { - "cell_type": "markdown", - "id": "fefa124d", - "metadata": {}, - "source": [ - "- [Accelerating drug discovery through knowledge graphs](https://aws.amazon.com/blogs/industries/accelerating-drug-discovery-through-knowledge-graph/)\n", - "\n", - "- [Analyze healthcare FHIR data with Amazon Neptune](https://aws.amazon.com/blogs/database/analyze-healthcare-fhir-data-with-amazon-neptune/)\n", - "\n", - "- [Building Amazon Neptune based MedDRA terminology mapping for pharmacovigilance and adverse event reporting](https://aws.amazon.com/blogs/industries/building-amazon-neptune-based-meddra-terminology-mapping-for-pharmacovigilance-and-adverse-event-reporting/)\n", - "\n", - "- [Building and querying the AWS COVID-19 knowledge graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)" - ] - }, - { - "cell_type": "markdown", - "id": "6f7c99a3", - "metadata": {}, - "source": [ - "## Clean Up" - ] - }, - { - "cell_type": "markdown", - "id": "9b35cd1c", - "metadata": {}, - "source": [ - "*WILL DELETE MOLECULE DATA*" - ] - }, - { - "cell_type": "markdown", - "id": "9d267a23", - "metadata": {}, - "source": [ - "Run the cell below to delete the data in the graph if you want to clean your graph database storage. This iterates through the `nodes_df`, so be sure to adjust accordingly if you have added any of your own additional molecules or edits." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "363ea618", - "metadata": {}, - "outputs": [], - "source": [ - "for i in nodes_df['~id']:\n", - " wr.neptune.execute_gremlin(client, \"g.V().has('~id', '\"+i+\"').drop();\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c13d347", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/__init__.py b/src/graph_notebook/notebooks/03-Sample-Applications/04-Life-Science-Applications/__init__.py deleted file mode 100644 index e69de29b..00000000 From 24b1e70272b407fa1882c432f333279872cae8cc Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 2 May 2023 15:25:15 -0700 Subject: [PATCH 03/15] Add files via upload --- ...lar-Structures-as-Graph-Data-Gremlin.ipynb | 623 ++++++++++++++++++ .../05-Life-Science-Applications/__init__.py | 0 2 files changed, 623 insertions(+) create mode 100644 src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb create mode 100644 src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/__init__.py diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb 
b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb new file mode 100644 index 00000000..f6081925 --- /dev/null +++ b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb @@ -0,0 +1,623 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01135e5b", + "metadata": {}, + "source": [ + "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. BSD 3-Clause \"New\" or \"Revised\" License" + ] + }, + { + "cell_type": "markdown", + "id": "e147f141", + "metadata": {}, + "source": [ + "# Model molecular SMILES data with Amazon Neptune and RDKit" + ] + }, + { + "cell_type": "markdown", + "id": "3b87be86", + "metadata": {}, + "source": [ + "This notebook walks through the process of modeling chemical structures using Amazon Neptune. It includes transforming a chemical compound represented as a SMILES string into graph data, with nodes representing individual atoms and edges representing the bonds between atoms. This notebook then closes with a visualization and exploration of the molecule caffeine."
+ ] + }, + { + "cell_type": "markdown", + "id": "9716ef94", + "metadata": {}, + "source": [ + " - [Background](#Background)\n", + "\n", + " - [Solution Overview](#Solution-Overview)\n", + "\n", + " - [Package Setup](#Package-Setup)\n", + "\n", + " - [Graph Data Model](#Graph-Data-Model)\n", + "\n", + " - [RDKit Processing](#RDKit-Processing)\n", + "\n", + " - [Amazon Neptune Data Upload](#Amazon-Neptune-Data-Upload)\n", + "\n", + " - [Basic Visualization & Queries](#Basic-Visualization-&-Queries)\n", + "\n", + " - [Clean Up](#Clean-Up)" + ] + }, + { + "cell_type": "markdown", + "id": "23409061", + "metadata": {}, + "source": [ + "## Background" + ] + }, + { + "cell_type": "markdown", + "id": "63016f6c", + "metadata": {}, + "source": [ + " Modeling chemical structures can be a complex and tedious process, even with the help of modern programs and technology. The ability to explore chemical structures at the most fundamental level of atoms and the bonds that connect them is an essential process in [drug discovery, pharmaceutical research](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00947), and [chemical engineering](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8130509/). By infusing chemical research with technology, researchers can expedite outcome timelines, identify hidden relationships, and overall simplify a traditionally complex process. " + ] + }, + { + "cell_type": "markdown", + "id": "a9bfdfa6", + "metadata": {}, + "source": [ + "## Solution Overview" + ] + }, + { + "cell_type": "markdown", + "id": "33e4d4ac", + "metadata": {}, + "source": [ + "In order to integrate technology into the analysis of chemical structures, molecules themselves must first be represented in a machine-readable format, such as [SMILES (simplified molecular-input line-entry system)](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system). **SMILES** format strings are the industry standard in representing molecular structures. 
The SMILES format enables the relationships between atoms in a molecular structure to be conveyed as a machine-processable string. The SMILES format is not all-encompassing, leaving out details such as certain polarities and bond properties. However, SMILES does enable powerful analysis at scale of different structures. \n", + "\n", + "Using Amazon Neptune and the open-source cheminformatics software package [RDKit](https://www.rdkit.org/), SMILES format data can be ingested, processed, and converted into nodes and edges in a property graph. Modeling molecular structures in a graph database allows for powerful custom visualization and manipulation at the scale demanded by pharmaceutical applications. Utilizing a graph database such as Neptune allows users to compare millions of molecules with millions of associated interactions. Additionally, the fully managed and serverless infrastructure allows experts with backgrounds in biology and chemistry to focus primarily on the research outcomes of their graph data, avoiding the undifferentiated heavy lifting of managing a complex graph database infrastructure.\n", + "\n", + "This walkthrough follows the process of converting a single SMILES string, **caffeine**, to graph data in Neptune. However, the process will work for any SMILES format string you would like to use. We’re sourcing the string for caffeine from the [National Library of Medicine](https://pubchem.ncbi.nlm.nih.gov/compound/2519#section=InChI), which maintains a public dataset of many chemical structures [**CN1C=NC2=C1C(=O)N(C(=O)N2C)C**].\n" + ] + }, + { + "cell_type": "markdown", + "id": "79e81e96", + "metadata": {}, + "source": [ + "We also use the open-source cheminformatics package [RDKit](https://github.com/rdkit/rdkit), Python-based data science package [Pandas](https://pandas.pydata.org/), and [AWS SDK for pandas (awswrangler)](https://aws-sdk-pandas.readthedocs.io/en/stable/). 
RDKit has a strong community and a great number of chem-informatics utilities; we’ll only be exploring a small portion for this post. Pandas is an open-source Python-based data science toolkit with large community support. The AWS SDK for pandas provides a large set of tools to help AWS services interact with the pandas library." + ] + }, + { + "cell_type": "markdown", + "id": "c49cf299", + "metadata": {}, + "source": [ + "## Package Setup" + ] + }, + { + "cell_type": "markdown", + "id": "4e1c8a9d", + "metadata": {}, + "source": [ + "The first step in modeling a chemical structure as graph data is importing the required packages. Here we will be using **RDKit**, **Pandas**, and **awswrangler**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb5f3b98", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install rdkit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "440c97e6", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install awswrangler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a5625f3", + "metadata": {}, + "outputs": [], + "source": [ + "from rdkit import Chem\n", + "import pandas as pd\n", + "import awswrangler as wr" + ] + }, + { + "cell_type": "markdown", + "id": "24e2d68d", + "metadata": {}, + "source": [ + "## Graph Data Model" + ] + }, + { + "cell_type": "markdown", + "id": "536a3491", + "metadata": {}, + "source": [ + "There are a few different options for graph query languages and their associated data models when working with Neptune; in this case we’re using **Apache TinkerPop’s Gremlin**. We are opting for Gremlin here due to its intuitive nature and easy to learn syntax. The cell below is defining a dictionary object for both the nodes and edges of our graph. Within each dictionary object is a set of properties we will gather from our caffeine molecule in the next section." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a8cd157", + "metadata": {}, + "outputs": [], + "source": [ + "nodes_dict = {'~id':[],\n", + " '~label':[],\n", + " 'idx':[],\n", + " 'atomicNumber':[],\n", + " 'isAromatic': []\n", + " }\n", + " \n", + "edges_dict = {'~id':[],\n", + " '~label':[],\n", + " '~from':[],\n", + " '~to':[],\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "d049b9e6", + "metadata": {}, + "source": [ + "## RDKit Processing" + ] + }, + { + "cell_type": "markdown", + "id": "21846f4c", + "metadata": {}, + "source": [ + "This section is where the chemical computing magic happens. We use the `rdkit` package installed earlier in the graph notebook to decompose our chemical structure into lists of **nodes (atoms)** and **edges (bonds)**." + ] + }, + { + "cell_type": "markdown", + "id": "c69d0e0d", + "metadata": {}, + "source": [ + "First, we want to declare our SMILES string for the caffeine molecule as a variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33a9fdf0", + "metadata": {}, + "outputs": [], + "source": [ + "caffeine_smiles = 'CN1C=NC2=C1C(=O)N(C(=O)N2C)C'" + ] + }, + { + "cell_type": "markdown", + "id": "cacd427b", + "metadata": {}, + "source": [ + "Next, to obtain a molecule-type object from the RDKit package, we need to use the below call to the `Chem` library from RDKit imported earlier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "208de8b7", + "metadata": {}, + "outputs": [], + "source": [ + "mol = Chem.MolFromSmiles(caffeine_smiles)" + ] + }, + { + "cell_type": "markdown", + "id": "45119037", + "metadata": {}, + "source": [ + "Let’s see a visual of our work so far, run the following cell to output a 2D picture of our molecule below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7260f5c8", + "metadata": {}, + "outputs": [], + "source": [ + "mol" + ] + }, + { + "cell_type": "markdown", + "id": "b1932568", + "metadata": {}, + "source": [ + "To recap what we just did, first we declared our SMILES string for caffeine as the variable `caffeine_smiles`. Next, we used the `Chem.MolFromSmiles` function from RDKit to turn the SMILES into a `mol` type object defined by RDKit. Finally, we returned the `mol` type object which resulted in a 2D image of the molecular structure for caffeine that we were working with." + ] + }, + { + "cell_type": "markdown", + "id": "0cbd5ec2", + "metadata": {}, + "source": [ + "Now we need to iterate through each atom and bond within the `mol` object outputted from RDKit. While iterating through each atom and bond, we use the graph data model we declared earlier, storing properties of each inside the data model. Feel free to dive deeper into the `mol.GetAtoms()` and `mol.GetBonds()` function calls on your own - we are only exploring a small subset of their functionality." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3b030e4", + "metadata": {}, + "outputs": [], + "source": [ + "for atom in mol.GetAtoms():\n", + " nodes_dict['~id'].append('Node-'+ caffeine_smiles + str(atom.GetIdx()))\n", + " nodes_dict['~label'].append(atom.GetSymbol())\n", + " nodes_dict['idx'].append(atom.GetIdx())\n", + " nodes_dict['atomicNumber'].append(atom.GetAtomicNum())\n", + " nodes_dict['isAromatic'].append(atom.GetIsAromatic())\n", + "\n", + "for bond in mol.GetBonds():\n", + " edges_dict['~id'].append('edge-'+ caffeine_smiles + str(bond.GetBeginAtomIdx()) + str(bond.GetEndAtomIdx()))\n", + " edges_dict['~label'].append(str(bond.GetBondType()))\n", + " edges_dict['~from'].append('Node-' + caffeine_smiles + str(bond.GetBeginAtomIdx()))\n", + " edges_dict['~to'].append('Node-' + caffeine_smiles + str(bond.GetEndAtomIdx()))" + ] + }, + { + "cell_type": "markdown", + "id": "09038090", + "metadata": {}, + "source": [ + "Several different RDKit functions are in this portion of code, so let’s break it down piece by piece: " + ] + }, + { + "cell_type": "markdown", + "id": "caeeaeb3", + "metadata": {}, + "source": [ + "•\tFor the `~id` field of both nodes and edges, we combine the data type `Node` or `Edge`, the SMILES string, and the unique index for the atom" + ] + }, + { + "cell_type": "markdown", + "id": "2e8287cf", + "metadata": {}, + "source": [ + "•\tFor the `~label` field, we use the chemical symbol for nodes, and the bond type for the edges" + ] + }, + { + "cell_type": "markdown", + "id": "19bf0671", + "metadata": {}, + "source": [ + "•\tThe fields `~from` and `~to` for the edges (bonds) are constructed by combining the prefix `Node-` with the SMILES string, and the respective beginning and ending atoms that the bond connects" + ] + }, + { + "cell_type": "markdown", + "id": "0da3d8f8", + "metadata": {}, + "source": [ + "•\tThe additional fields for the nodes (atoms) in the graph model are the atom’s unique ID within the 
molecule, its atomic number, and whether or not it is aromatic" + ] + }, + { + "cell_type": "markdown", + "id": "3ccc7a32", + "metadata": {}, + "source": [ + "Note that you can extract several atomic properties for a given SMILES string in RDKit and add them as additional fields for a given atom or bond. We don’t list them all in this post, but you can explore additional fields for both the atoms and bonds." + ] + }, + { + "cell_type": "markdown", + "id": "bcbf50b9", + "metadata": {}, + "source": [ + "Finally, we want to use **Pandas** to transform our data-populated dictionaries into pandas data frames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab3a8fa7", + "metadata": {}, + "outputs": [], + "source": [ + "nodes_df = pd.DataFrame.from_dict(nodes_dict)\n", + "edges_df = pd.DataFrame.from_dict(edges_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "35572287", + "metadata": {}, + "source": [ + "Let's check the results of our work so far. Running the cells below should return data frames for both the edges and nodes of our caffeine molecule." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f8df5de", + "metadata": {}, + "outputs": [], + "source": [ + "edges_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1096a312", + "metadata": {}, + "outputs": [], + "source": [ + "nodes_df" + ] + }, + { + "cell_type": "markdown", + "id": "da724549", + "metadata": {}, + "source": [ + "## Amazon Neptune Data Upload" + ] + }, + { + "cell_type": "markdown", + "id": "5ead358b", + "metadata": {}, + "source": [ + "Now that we have successfully decomposed our caffeine SMILES string into individual atoms and bonds, the next step is to load our data into the Neptune database itself. This will be much simpler than loading data from an external source because our data is already inside the graph notebook environment. 
In order to write our data to the Neptune database, we will be using the **AWS SDK for pandas**, also known as **awswrangler**." + ] + }, + { + "cell_type": "markdown", + "id": "271ac986", + "metadata": {}, + "source": [ + "First, we need to check our notebook configuration to gather the host endpoint for our cluster. Running the cell below will provide that information, along with other important details about our Neptune database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61c779af", + "metadata": {}, + "outputs": [], + "source": [ + "%graph_notebook_config" + ] + }, + { + "cell_type": "markdown", + "id": "66745fed", + "metadata": {}, + "source": [ + "Find the `host` field from the above output and copy & paste the string into the cell below where it says `\"INSERT YOUR HOST HERE\"`. Also, check the port number above and ensure that it is the same as the second parameter in the cell below. The default port number should be `8182`." + ] + }, + { + "cell_type": "markdown", + "id": "7343a1e7", + "metadata": {}, + "source": [ + "Run the cell below once the host and port are properly copied from your graph configuration output above. This cell is simply using a command from the Neptune section of the awswrangler library to establish a connection to your Neptune instance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b5da429", + "metadata": {}, + "outputs": [], + "source": [ + "client = wr.neptune.connect(\"INSERT YOUR HOST HERE\", 8182, iam_enabled=False)" + ] + }, + { + "cell_type": "markdown", + "id": "d26e71fd", + "metadata": {}, + "source": [ + "The next two cells use the `to_property_graph` function within awswrangler to insert both the node & edge data frames we created earlier into our Neptune database. Both cells should return a `\"True\"` upon success."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60f6dcf4", + "metadata": {}, + "outputs": [], + "source": [ + "wr.neptune.to_property_graph(client, df=nodes_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e06f3138", + "metadata": {}, + "outputs": [], + "source": [ + "wr.neptune.to_property_graph(client, df=edges_df)" + ] + }, + { + "cell_type": "markdown", + "id": "8064ef96", + "metadata": {}, + "source": [ + "After receiving a `\"True\"` output from both cells, you are finished with processing your SMILES molecule string. Now you can move on to visualizing your molecule as graph data. If you wish to add additional compounds to your graph database, you can return to the start of the [Graph Data Model](#Graph-Data-Model) section to reset the node and edge dictionaries, and then redo the [RDKit Processing](#RDKit-Processing) steps with a different SMILES string." + ] + }, + { + "cell_type": "markdown", + "id": "3da70417", + "metadata": {}, + "source": [ + "## Basic Visualization & Queries" + ] + }, + { + "cell_type": "markdown", + "id": "cb19689b", + "metadata": {}, + "source": [ + "Now that we have processed and loaded our molecular data, let's visualize the results of our efforts so far. The below cell uses a Gremlin query to traverse outwards from nodes which are labeled as **'C'** (*for the element Carbon*), giving us a picture of the overall structure of our molecule." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5dbb8414", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%gremlin -p v,ine,outv,oute,inv,oute,inv,oute,inv,oute,inv\n", + "g.V().has('~label','C').repeat(outE().inV()).emit().times(5).path().by(valueMap(true))" + ] + }, + { + "cell_type": "markdown", + "id": "553a58d8", + "metadata": {}, + "source": [ + "You should receive an interactive view of the 2D image RDKit produced earlier as the output of the above cell. 
Feel free to explore the structure and compare it to other visualizations you might be able to find of your molecule. Also, now that our molecule is persisted as graph data, you can manipulate, edit, and add additional data to your molecular structure as you see fit." + ] + }, + { + "cell_type": "markdown", + "id": "2bc4961d", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "a7dd3d35", + "metadata": {}, + "source": [ + "In this guide, you ingested and parsed a **SMILES** format molecular data string with **RDKit** and\n", + "uploaded the individual atoms and bonds as graph data to **Amazon Neptune**. You can replicate this\n", + "process at *scale* to accommodate large datasets containing many SMILES strings. You can test\n", + "this yourself by following the steps for any SMILES string of your choice. With the molecular\n", + "data broken into individual atoms and bonds in Neptune, you can connect this data to **custom\n", + "bioinformatics applications**, **chemical computing systems**, and **research software environments**.\n", + "You can take this solution even further by integrating [Amazon Neptune ML](https://aws.amazon.com/blogs/database/how-to-get-started-with-neptune-ml/) to gain the ability to\n", + "predict the connections and properties of your molecules." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b360fb55", + "metadata": {}, + "source": [ + "**See these other resources below to learn more about Amazon Neptune's role in Healthcare & Life Sciences:**" + ] + }, + { + "cell_type": "markdown", + "id": "fefa124d", + "metadata": {}, + "source": [ + "- [Accelerating drug discovery through knowledge graphs](https://aws.amazon.com/blogs/industries/accelerating-drug-discovery-through-knowledge-graph/)\n", + "\n", + "- [Analyze healthcare FHIR data with Amazon Neptune](https://aws.amazon.com/blogs/database/analyze-healthcare-fhir-data-with-amazon-neptune/)\n", + "\n", + "- [Building Amazon Neptune based MedDRA terminology mapping for pharmacovigilance and adverse event reporting](https://aws.amazon.com/blogs/industries/building-amazon-neptune-based-meddra-terminology-mapping-for-pharmacovigilance-and-adverse-event-reporting/)\n", + "\n", + "- [Building and querying the AWS COVID-19 knowledge graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)" + ] + }, + { + "cell_type": "markdown", + "id": "6f7c99a3", + "metadata": {}, + "source": [ + "## Clean Up" + ] + }, + { + "cell_type": "markdown", + "id": "9b35cd1c", + "metadata": {}, + "source": [ + "*WILL DELETE MOLECULE DATA*" + ] + }, + { + "cell_type": "markdown", + "id": "9d267a23", + "metadata": {}, + "source": [ + "Run the cell below to delete the data in the graph if you want to clean your graph database storage. This iterates through the `nodes_df`, so be sure to adjust accordingly if you have added any of your own additional molecules or edits." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "363ea618", + "metadata": {}, + "outputs": [], + "source": [ + "for i in nodes_df['~id']:\n", + " wr.neptune.execute_gremlin(client, \"g.V().has('~id', '\"+i+\"').drop();\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c13d347", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/__init__.py b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/__init__.py new file mode 100644 index 00000000..e69de29b From 75d1118157351618d9551fe06b396365ca8b4afa Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 2 May 2023 16:03:57 -0700 Subject: [PATCH 04/15] Update README.md added in the life sciences section to serve as a home for the new sub-section of sample applications --- .../notebooks/03-Sample-Applications/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/README.md b/src/graph_notebook/notebooks/03-Sample-Applications/README.md index f67d868a..eb7efd05 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/README.md +++ b/src/graph_notebook/notebooks/03-Sample-Applications/README.md @@ -4,6 +4,7 @@ - [Knowledge Graph](#Knowledge-Graph) - [Identity Graph](#Identity-Graph) - [Security Graph](#Security-Graph) +- [Life Sciences](#Life-Sciences-Graph) - [Neptune ML - People 
Analytics](#Neptune-ML-People-Analytics) @@ -35,8 +36,14 @@ A security graph connects resources within a network: entities such as policies, The examples in this notebook shows a sample security graph solution using an open dataset as well as data visualizations that allow one to better understand the structure of a security graph. +## [Life Sciences](./05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb) + +A molecular modeling graph which represents chemical structures as graph data. Visualizing atoms as nodes, and the bonds between atoms as edges, enables users to explore chemical structures in a persistent environment. Using the popular open-source bioinformatics package RDKit, and a molecule represented as a SMILES string, walk through the process of transforming a molecule into a graph. Also, walk through a simple example of using packages in the graph-notebook environment. + +The examples in this notebook show how to take the SMILES format string for the molecule caffeine and transform it into a graph representation. The notebook also includes exploration and visualization of the caffeine molecule as a graph. + ## [People Analytics using Machine Learning](../04-Machine-Learning/Sample-Applications/01-People-Analytics) Hiring and retaining good personnel is a key characteristic to making an organization successful. One way that organizations approach this problem is through the use of people analytics. People analytics allow business leaders to make data-driven decisions about personnel-related issues such as recruiting, evaluation, hiring, promotion, and retention, etc. -The examples in this notebook shows a sample people analytics graph solution using an open dataset that incorporates graph neural network based machine learning using Neptune ML. 
\ No newline at end of file +The examples in this notebook shows a sample people analytics graph solution using an open dataset that incorporates graph neural network based machine learning using Neptune ML. From bad2659efbe86eea6a60dc3ea55cc069dfb5d39f Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 2 May 2023 16:07:04 -0700 Subject: [PATCH 05/15] Update test_validate_notebooks.py added unit test for the new notebook & folder life-sciences --- test/unit/notebooks/test_validate_notebooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/unit/notebooks/test_validate_notebooks.py b/test/unit/notebooks/test_validate_notebooks.py index 479f1917..de496db2 100644 --- a/test/unit/notebooks/test_validate_notebooks.py +++ b/test/unit/notebooks/test_validate_notebooks.py @@ -40,6 +40,7 @@ def test_no_extra_notebooks(self): f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/03-Identity-Graphs/03-Jumpstart-Identity-Graphs-Using-Canonical-Model-and-ETL/03-Jumpstart-Identity-Graphs-Using-Canonical-Model-and-ETL.ipynb', f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/04-Security-Graphs/01-Building-a-Security-Graph-Application-with-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/04-Security-Graphs/01-Building-a-Security-Graph-Application-with-openCypher.ipynb', + f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-00-Getting-Started-with-Neptune-ML-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-01-Introduction-to-Node-Classification-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-02-Introduction-to-Node-Regression-Gremlin.ipynb', From db2b1d9f6ce0b4ccd426fd16233ba6a6502603ad Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 2 May 2023 16:10:39 -0700 Subject: [PATCH 06/15] Adding in the 
copyright and license statement --- ...01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb index f6081925..4bec2b7b 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb @@ -5,7 +5,7 @@ "id": "01135e5b", "metadata": {}, "source": [ - "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. BSD 3-Clause \"New\" or \"Revised\" License" + "Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
SPDX-License-Identifier: Apache-2.0" ] }, { From 7a9752af6d6a3b2f2394149c628f98f66aa61f42 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Wed, 3 May 2023 11:19:20 -0700 Subject: [PATCH 07/15] Update test_validate_notebooks.py --- test/unit/notebooks/test_validate_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/notebooks/test_validate_notebooks.py b/test/unit/notebooks/test_validate_notebooks.py index de496db2..d185e06c 100644 --- a/test/unit/notebooks/test_validate_notebooks.py +++ b/test/unit/notebooks/test_validate_notebooks.py @@ -40,7 +40,7 @@ def test_no_extra_notebooks(self): f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/03-Identity-Graphs/03-Jumpstart-Identity-Graphs-Using-Canonical-Model-and-ETL/03-Jumpstart-Identity-Graphs-Using-Canonical-Model-and-ETL.ipynb', f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/04-Security-Graphs/01-Building-a-Security-Graph-Application-with-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/04-Security-Graphs/01-Building-a-Security-Graph-Application-with-openCypher.ipynb', - f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin', + f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-00-Getting-Started-with-Neptune-ML-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-01-Introduction-to-Node-Classification-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-02-Introduction-to-Node-Regression-Gremlin.ipynb', From 85b15ce66042626bc67b133107715972a048fcea Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Wed, 3 May 2023 11:25:28 -0700 Subject: [PATCH 08/15] Update 00-Sample-Applications-Overview.ipynb matching .READ_ME & overview files per unit 
test failures --- .../03-Sample-Applications/00-Sample-Applications-Overview.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb index e1d949b6..0d9ee739 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb @@ -10,6 +10,7 @@ "- [Knowledge Graph](#Knowledge-Graph)\n", "- [Identity Graph](#Identity-Graph)\n", "- [Security Graph](#Security-Graph)\n", + "- [Life Sciences](#Life-Sciences-Graph)\n", "- [Neptune ML - People Analytics](#Neptune-ML-People-Analytics)\n", "\n", "\n", From 2c5cc88c3677bd43f020ebd851421e8095eea592 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Wed, 3 May 2023 11:36:39 -0700 Subject: [PATCH 09/15] Update 00-Sample-Applications-Overview.ipynb Adding life sciences section to the overview.ipnb to match the read me per unit test failures --- .../00-Sample-Applications-Overview.ipynb | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb index 0d9ee739..05c569e4 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb @@ -42,6 +42,12 @@ "\n", "The examples in this notebook shows a sample security graph solution using an open dataset as well as data visualizations that allow one to better understand the structure of a security graph.\n", "\n", + "## [Life 
Sciences](./05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb)\n", + "\n", + "A molecular modeling graph which represents chemical structures as graph data. Visualizing atoms as nodes, and the bonds between atoms as edges, enables users to explore chemical structures in a persistent environment. Using the popular open-source bioinformatics package RDKit, and a molecule represented as a SMILES string, walk through the process of transforming a molecule into a graph. Also, walk through a simple example of using packages in the graph-notebook environment.\n", + "\n", + "The examples in this notebook show how to take the SMILES format string for the molecule caffeine and transform it into a graph representation. The notebook also includes exploration and visualization of the caffeine molecule as a graph.\n", + "\n", "## [People Analytics using Machine Learning](../04-Machine-Learning/Sample-Applications/01-People-Analytics)\n", "\n", "Hiring and retaining good personnel is a key characteristic to making an organization successful. One way that organizations approach this problem is through the use of people analytics. 
People analytics allow business leaders to make data-driven decisions about personnel-related issues such as recruiting, evaluation, hiring, promotion, and retention, etc.\n", From 5ff567e9bf1a2cc65abc2fca550c2b6b90199aa2 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Wed, 3 May 2023 14:11:01 -0700 Subject: [PATCH 10/15] Update README.md Removed space at the end of description - should match now --- src/graph_notebook/notebooks/03-Sample-Applications/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/README.md b/src/graph_notebook/notebooks/03-Sample-Applications/README.md index eb7efd05..e48b74f9 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/README.md +++ b/src/graph_notebook/notebooks/03-Sample-Applications/README.md @@ -38,7 +38,7 @@ The examples in this notebook shows a sample security graph solution using an op ## [Life Sciences](./05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb) -A molecular modeling graph which represents chemical structures as graph data. Visualizing atoms as nodes, and the bonds between atoms as edges, enables users to explore chemical structures in a persistent environment. Using the popular open-source bioinformatics package RDKit, and a molecule represented as a SMILES string, walk through the process of transforming a molecule into a graph. Also, walk through a simple example of using packages in the graph-notebook environment. +A molecular modeling graph which represents chemical structures as graph data. Visualizing atoms as nodes, and the bonds between atoms as edges, enables users to explore chemical structures in a persistent environment. Using the popular open-source bioinformatics package RDKit, and a molecule represented as a SMILES string, walk through the process of transforming a molecule into a graph. 
Also, walk through a simple example of using packages in the graph-notebook environment. The examples in this notebook show how to take the SMILES format string for the molecule caffeine and transform it into a graph representation. The notebook also includes exploration and visualization of the caffeine molecule as a graph. From 7c81cfba149bb33b7bdfcb97fa5694dfd63eca89 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Thu, 4 May 2023 20:29:25 -0700 Subject: [PATCH 11/15] Update 01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb Adding in additional link for next steps section --- ...-Molecular-Structures-as-Graph-Data-Gremlin.ipynb | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb index 4bec2b7b..9b100775 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb @@ -552,7 +552,9 @@ "\n", "- [Building Amazon Neptune based MedDRA terminology mapping for pharmacovigilance and adverse event reporting](https://aws.amazon.com/blogs/industries/building-amazon-neptune-based-meddra-terminology-mapping-for-pharmacovigilance-and-adverse-event-reporting/)\n", "\n", - "- [Building and querying the AWS COVID-19 knowledge graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)" + "- [Building and querying the AWS COVID-19 knowledge 
graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)", + "\n", + "- [General Neptune Developer Resources](https://aws.amazon.com/neptune/developer-resources/)" ] }, { @@ -589,14 +591,6 @@ "for i in nodes_df['~id']:\n", " wr.neptune.execute_gremlin(client, \"g.V().has('~id', '\"+i+\"').drop();\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3c13d347", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 7c4d733cbd5c2f0cd67252debb9a9800a695a6b9 Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Thu, 4 May 2023 20:35:54 -0700 Subject: [PATCH 12/15] Update README.md matching to pass unit test - readme & sample applications overview diff From 1165245bdda58db05c14553c099a9694d0442b3b Mon Sep 17 00:00:00 2001 From: graham Date: Tue, 9 May 2023 03:17:48 -0700 Subject: [PATCH 13/15] changesv1 --- .../00-Sample-Applications-Overview.ipynb | 5 +++-- ...odeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb | 5 +++-- .../__init__.py | 0 .../notebooks/03-Sample-Applications/README.md | 6 +++--- 4 files changed, 9 insertions(+), 7 deletions(-) rename src/graph_notebook/notebooks/03-Sample-Applications/{05-Life-Science-Applications => 05-Healthcare-and-Life-Sciences-Graphs}/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb (99%) rename src/graph_notebook/notebooks/03-Sample-Applications/{05-Life-Science-Applications => 05-Healthcare-and-Life-Sciences-Graphs}/__init__.py (100%) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb index 05c569e4..11431283 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/00-Sample-Applications-Overview.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + 
"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -10,7 +11,7 @@ "- [Knowledge Graph](#Knowledge-Graph)\n", "- [Identity Graph](#Identity-Graph)\n", "- [Security Graph](#Security-Graph)\n", - "- [Life Sciences](#Life-Sciences-Graph)\n", + "- [Healthcare and Life Sciences Graphs](#Healthcare-and-Life-Sciences-Graphs)\n", "- [Neptune ML - People Analytics](#Neptune-ML-People-Analytics)\n", "\n", "\n", @@ -42,7 +43,7 @@ "\n", "The examples in this notebook shows a sample security graph solution using an open dataset as well as data visualizations that allow one to better understand the structure of a security graph.\n", "\n", - "## [Life Sciences](./05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb)\n", + "## [Healthcare and Life Sciences Graphs](./05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb)\n", "\n", "A molecular modeling graph which represents chemical structures as graph data. Visualizing atoms as nodes, and the bonds between atoms as edges, enables users to explore chemical structures in a persistent environment. Using the popular open-source bioinformatics package RDKit, and a molecule represented as a SMILES string, walk through the process of transforming a molecule into a graph. 
Also, walk through a simple example of using packages in the graph-notebook environment.\n", "\n", diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb similarity index 99% rename from src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb rename to src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb index 9b100775..84ff35a0 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb @@ -9,11 +9,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e147f141", "metadata": {}, "source": [ - "# Model molecular SMILES data with Amazon Neptune and RDKit" + "# Modeling molecular SMILES data with Amazon Neptune and RDKit" ] }, { @@ -552,7 +553,7 @@ "\n", "- [Building Amazon Neptune based MedDRA terminology mapping for pharmacovigilance and adverse event reporting](https://aws.amazon.com/blogs/industries/building-amazon-neptune-based-meddra-terminology-mapping-for-pharmacovigilance-and-adverse-event-reporting/)\n", "\n", - "- [Building and querying the AWS COVID-19 knowledge graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)", + "- [Building and querying the AWS COVID-19 knowledge graph](https://aws.amazon.com/blogs/database/building-and-querying-the-aws-covid-19-knowledge-graph/)\n", "\n", "- [General Neptune Developer 
Resources](https://aws.amazon.com/neptune/developer-resources/)" ] diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/__init__.py b/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/__init__.py similarity index 100% rename from src/graph_notebook/notebooks/03-Sample-Applications/05-Life-Science-Applications/__init__.py rename to src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/__init__.py diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/README.md b/src/graph_notebook/notebooks/03-Sample-Applications/README.md index e48b74f9..09e4cb59 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/README.md +++ b/src/graph_notebook/notebooks/03-Sample-Applications/README.md @@ -4,7 +4,7 @@ - [Knowledge Graph](#Knowledge-Graph) - [Identity Graph](#Identity-Graph) - [Security Graph](#Security-Graph) -- [Life Sciences](#Life-Sciences-Graph) +- [Healthcare and Life Sciences Graphs](#Healthcare-and-Life-Sciences-Graphs) - [Neptune ML - People Analytics](#Neptune-ML-People-Analytics) @@ -36,7 +36,7 @@ A security graph connects resources within a network: entities such as policies, The examples in this notebook shows a sample security graph solution using an open dataset as well as data visualizations that allow one to better understand the structure of a security graph. -## [Life Sciences](./05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb) +## [Healthcare and Life Sciences Graphs](./05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb) A molecular modeling graph which represents chemical structures as graph data. Visualizing atoms as nodes, and the bonds between atoms as edges, enables users to explore chemical structures in a persistent environment. 
Using the popular open-source bioinformatics package RDKit, and a molecule represented as a SMILES string, walk through the process of transforming a molecule into a graph. Also, walk through a simple example of using packages in the graph-notebook environment. @@ -46,4 +46,4 @@ The examples in this notebook show how to take the SMILES format string for the Hiring and retaining good personnel is a key characteristic to making an organization successful. One way that organizations approach this problem is through the use of people analytics. People analytics allow business leaders to make data-driven decisions about personnel-related issues such as recruiting, evaluation, hiring, promotion, and retention, etc. -The examples in this notebook shows a sample people analytics graph solution using an open dataset that incorporates graph neural network based machine learning using Neptune ML. +The examples in this notebook shows a sample people analytics graph solution using an open dataset that incorporates graph neural network based machine learning using Neptune ML. 
\ No newline at end of file From a6c9f6eb9fd31004417f793118f1e7923530472b Mon Sep 17 00:00:00 2001 From: gkutchekAWS <99388965+gkutchekAWS@users.noreply.github.com> Date: Tue, 9 May 2023 03:22:19 -0700 Subject: [PATCH 14/15] Update test_validate_notebooks.py added to tests --- test/unit/notebooks/test_validate_notebooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/notebooks/test_validate_notebooks.py b/test/unit/notebooks/test_validate_notebooks.py index d185e06c..a9b39338 100644 --- a/test/unit/notebooks/test_validate_notebooks.py +++ b/test/unit/notebooks/test_validate_notebooks.py @@ -40,7 +40,7 @@ def test_no_extra_notebooks(self): f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/03-Identity-Graphs/03-Jumpstart-Identity-Graphs-Using-Canonical-Model-and-ETL/03-Jumpstart-Identity-Graphs-Using-Canonical-Model-and-ETL.ipynb', f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/04-Security-Graphs/01-Building-a-Security-Graph-Application-with-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/04-Security-Graphs/01-Building-a-Security-Graph-Application-with-openCypher.ipynb', - f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/05-Life-Science-Applications/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb', + f'{NOTEBOOK_BASE_DIR}/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-00-Getting-Started-with-Neptune-ML-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-01-Introduction-to-Node-Classification-Gremlin.ipynb', f'{NOTEBOOK_BASE_DIR}/04-Machine-Learning/Neptune-ML-02-Introduction-to-Node-Regression-Gremlin.ipynb', From f70a1559f68a59e1508bc56906ecff1c155c949f Mon Sep 17 00:00:00 2001 From: Joy Wang <73913166+joywa@users.noreply.github.com> Date: Thu, 11 May 2023 10:02:53 -0700 Subject: [PATCH 15/15] Update 01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb typos --- 
...-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb b/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb index 84ff35a0..9d0ce419 100644 --- a/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb +++ b/src/graph_notebook/notebooks/03-Sample-Applications/05-Healthcare-and-Life-Sciences-Graphs/01-Modeling-Molecular-Structures-as-Graph-Data-Gremlin.ipynb @@ -443,7 +443,7 @@ "id": "d26e71fd", "metadata": {}, "source": [ - "The next two cells use the `.to_proprty_graph` functions within awswrangler to insert both the node & edge data frames we created earlier into our Neptune database. Both cells should return a `\"True\"` upon success." + "The next two cells use the `.to_property_graph` functions within awswrangler to insert both the node & edge data frames we created earlier into our Neptune database. Both cells should return a `\"True\"` upon success." ] }, { @@ -471,7 +471,7 @@ "id": "8064ef96", "metadata": {}, "source": [ - "After recieving a `\"True\"` output from both cells, you are finished with processing your SMILES molecule string. Now you can move onto visualizing your molecule as graph data. If you wish to add additional compounds to your graph database, you can return to the start of the [RDKit Processing](#RDKit-Processing) section and simply redo the process with a different SMILES string." + "After receiving a `\"True\"` output from both cells, you are finished with processing your SMILES molecule string. Now you can move on to visualizing your molecule as graph data. 
If you wish to add additional compounds to your graph database, you can return to the start of the [RDKit Processing](#RDKit-Processing) section and simply redo the process with a different SMILES string." ] }, {