Skip to content

General OnToma module for PySpark parsers + PhenoDigm & PanelApp implementation #94

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 15, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions common/ontology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import logging
import random
import time

from ontoma.interface import OnToma
from pandarallel import pandarallel

ONTOMA_MAX_ATTEMPTS = 5
pandarallel.initialize()


def _ontoma_udf(row, ontoma_instance):
disease_name, disease_id = row['diseaseFromSource'], row['diseaseFromSourceId']
for attempt in range(1, ONTOMA_MAX_ATTEMPTS + 1):
# Try to map first by disease name (because that branch of OnToma is more stable), then by disease ID.
try:
mappings = []
if disease_name:
mappings = ontoma_instance.find_term(query=disease_name, code=False)
if disease_id and not mappings:
mappings = ontoma_instance.find_term(query=disease_id, code=True)
return [m.id_ot_schema for m in mappings]
except:
# If this is not the last attempt, wait until the next one
if attempt != ONTOMA_MAX_ATTEMPTS:
time.sleep(10 + 30 * random.random())
logging.error(f'OnToma lookup failed for {disease_name!r} / {disease_id!r}')
return []


def add_efo_mapping(evidence_strings, spark_instance, ontoma_cache_dir=None):
    """Populate the diseaseFromSourceMappedId field of evidence strings with EFO mappings from OnToma.

    Given evidence strings with diseaseFromSource and diseaseFromSourceId fields, try to populate EFO mapping
    field diseaseFromSourceMappedId. In case there are multiple matches, the evidence strings will be exploded
    accordingly.

    Currently, both source columns (diseaseFromSource and diseaseFromSourceId) need to be present in the original
    schema, although they do not have to be populated for all rows.

    Args:
        evidence_strings: Spark DataFrame containing at least the diseaseFromSource and diseaseFromSourceId
            columns.
        spark_instance: Spark session used to convert the mapped pandas DataFrame back into a Spark DataFrame.
        ontoma_cache_dir: Optional directory handed to OnToma for caching between runs.

    Returns:
        Spark DataFrame with the rows of `evidence_strings` (duplicated when a disease maps to several EFO
        terms) plus the diseaseFromSourceMappedId column.
    """
    # Deduplicate first so the expensive OnToma lookups run once per distinct pair, not once per evidence row.
    logging.info('Collect all distinct (disease name, disease ID) pairs.')
    disease_info_to_map = (
        evidence_strings
        .select('diseaseFromSource', 'diseaseFromSourceId')
        .distinct()
        .toPandas()
    )

    logging.info('Initialise OnToma instance')
    ontoma_instance = OnToma(cache_dir=ontoma_cache_dir)

    # parallel_apply (pandarallel) fans the per-row lookups out across local worker processes;
    # each row yields a (possibly empty) list of EFO IDs.
    logging.info('Map disease information to EFO.')
    disease_info_to_map['diseaseFromSourceMappedId'] = disease_info_to_map.parallel_apply(
        _ontoma_udf, args=(ontoma_instance,), axis=1
    )
    # One output row per mapping; a pair with an empty mapping list becomes a single row with a missing value.
    disease_info_to_map = disease_info_to_map.explode('diseaseFromSourceMappedId')

    # NOTE(review): astype(str) turns missing values (None/NaN) into the literal strings 'None'/'nan', both in
    # the join key columns and in diseaseFromSourceMappedId — confirm downstream consumers expect/clean these.
    logging.info('Join the resulting information into the evidence strings.')
    disease_info_df = spark_instance.createDataFrame(disease_info_to_map.astype(str))
    return evidence_strings.join(
        disease_info_df,
        on=['diseaseFromSource', 'diseaseFromSourceId'],
        how='left'
    )
4 changes: 3 additions & 1 deletion envs/environment-lock.yml
Original file line number Diff line number Diff line change
Expand Up @@ -249,5 +249,7 @@ dependencies:
- zlib=1.2.11
- zstd=1.5.0
- pip:
- dill==0.3.4
- obonet==0.3.0
- ontoma==0.0.17
- ontoma==1.0.0
- pandarallel==1.5.2
3 changes: 2 additions & 1 deletion envs/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ dependencies:
- snakemake==6.0.0
- tqdm=4.58.0
- pip:
- ontoma==0.0.17
- ontoma==1.0.0
- pandarallel==1.5.2
5 changes: 5 additions & 0 deletions modules/PanelApp.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
array, array_distinct, col, collect_set, concat, explode, lit, regexp_extract, regexp_replace, split, trim, when
)

from common.ontology import add_efo_mapping


class PanelAppEvidenceGenerator:

Expand Down Expand Up @@ -238,6 +240,9 @@ def generate_panelapp_evidence(
.distinct()
)

# Add EFO mapping information.
panelapp_df = add_efo_mapping(evidence_strings=panelapp_df, spark_instance=self.spark)

logging.info('Save data.')
with tempfile.TemporaryDirectory() as tmp_dir_name:
(
Expand Down
19 changes: 13 additions & 6 deletions modules/PhenoDigm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import requests
from retry import retry

from common.ontology import add_efo_mapping


# The tables and their fields to fetch from SOLR. Other tables (not currently used): gene, disease_gene_summary.
IMPC_SOLR_TABLES = {
Expand Down Expand Up @@ -410,13 +412,18 @@ def generate_phenodigm_evidence_strings(self, score_cutoff):
# Add constant value columns.
.withColumn('datasourceId', pf.lit('phenodigm'))
.withColumn('datatypeId', pf.lit('animal_model'))
)

# Ensure stable column order.
.select('biologicalModelAllelicComposition', 'biologicalModelGeneticBackground', 'biologicalModelId',
'datasourceId', 'datatypeId', 'diseaseFromSource', 'diseaseFromSourceId',
'diseaseModelAssociatedHumanPhenotypes', 'diseaseModelAssociatedModelPhenotypes', 'literature',
'resourceScore', 'targetFromSourceId', 'targetInModel', 'targetInModelEnsemblId',
'targetInModelMgiId')
# Add EFO mapping information.
self.evidence = add_efo_mapping(evidence_strings=self.evidence, spark_instance=self.spark,
ontoma_cache_dir=self.cache_dir)

# Ensure stable column order.
self.evidence = self.evidence.select(
'biologicalModelAllelicComposition', 'biologicalModelGeneticBackground', 'biologicalModelId',
'datasourceId', 'datatypeId', 'diseaseFromSource', 'diseaseFromSourceId', 'diseaseFromSourceMappedId',
'diseaseModelAssociatedHumanPhenotypes', 'diseaseModelAssociatedModelPhenotypes', 'literature',
'resourceScore', 'targetFromSourceId', 'targetInModel', 'targetInModelEnsemblId', 'targetInModelMgiId'
)

def generate_mouse_phenotypes_dataset(self):
Expand Down