Machine-Learning-for-Medical-Language · comorbidity · Sep 26, 2022 · Sep 26, 2022 · Sep 26, 2022 · Sep 26, 2022
diff --git a/ctakesclient/__init__.py b/ctakesclient/__init__.py
@@ -1,6 +1,9 @@
 """Public API"""
 
 from . import typesystem
+from .typesystem import CtakesJSON, MatchText, Polarity, Span
+from .typesystem import UmlsTypeMention, UmlsConcept
+
 from . import filesystem
 from . import client
 from . import transformer
diff --git a/ctakesclient/client.py b/ctakesclient/client.py
@@ -24,12 +24,29 @@ def get_url_ctakes_rest() -> str:
         'URL_CTAKES_REST',
         'http://localhost:8080/ctakes-web-rest/service/analyze')
 
+def ascii(sentence: str) -> bytes:
+    """Encode text as ASCII bytes, dropping every character that is not ASCII.
+    https://pythonguides.com/remove-unicode-characters-in-python/#:~:text=In%20python%2C%20to%20remove%20non,decode().
+    :param sentence: text to encode
+    :return: ASCII bytes - note LENGTH may be shorter than the text input!
+    """
+    return sentence.encode(encoding="ascii", errors="ignore")
+
+def utf8(sentence: str) -> bytes:
+    """Encode text as UTF-8 bytes; every character of the input is kept.
+    https://stackoverflow.com/questions/34618149/post-unicode-string-to-web-service-using-python-requests-library
+    :param sentence: text to encode
+    :return: UTF-8 bytes - note the byte LENGTH may exceed the text LENGTH!
+    """
+    # Unlike ascii(), nothing is removed: each non-ASCII character is encoded
+    # as a multi-byte sequence.
+    return sentence.encode(encoding="utf-8")
 
 def post(sentence: str, url=get_url_ctakes_rest()) -> dict:
     """
     :param sentence: clinical text to send to cTAKES
     :param url: cTAKES REST server fully qualified path
-    :return:
+    :return: dict JSON response from cTAKES
     """
     logging.debug(url)
     # TODO: consider exposing a pass-through timeout parameter

diff --git a/ctakesclient/typesystem.py b/ctakesclient/typesystem.py
@@ -1,10 +1,19 @@
 """UMLS (Unified Medical Language System)"""
 import json
 import logging
+from collections import Counter, OrderedDict
 from typing import List
 from enum import Enum
 
+###############################################################################
+#
+# UMLS Unified Medical Language System
+#
+###############################################################################
 
+###############################################################################
+# UMLS Semantic Types
+###############################################################################
 class UmlsTypeMention(Enum):
     """
     Semantic Types in the UMLS (Unified Medical Language System)
@@ -19,7 +28,9 @@ class UmlsTypeMention(Enum):
     Procedure = 'ProcedureMention'
     CustomDict = 'IdentifiedAnnotation'
 
-
+###############################################################################
+# UMLS Concepts
+###############################################################################
 class UmlsConcept:
     """Concept in the UMLS (Unified Medical Language System)"""
 
@@ -78,6 +89,10 @@ def __str__(self):
 ###############################################################################
 
 
+###############################################################################
+# Match: Text, Span, Polarity
+###############################################################################
+
 class Polarity(Enum):
     """"
     Polarity means "negation" like "patient denies cough".
@@ -130,45 +145,18 @@ def __init__(self, source=None):
     def span(self) -> Span:
         return Span(self.begin, self.end)
 
-    @staticmethod
-    def parse_polarity(polarity) -> Polarity:
-        if isinstance(polarity, Polarity):
-            return polarity
-        elif polarity == -1:
-            return Polarity.neg
-        elif polarity == 0:
-            return Polarity.pos
-        else:
-            raise Exception(f'polarity unknown: {polarity}')
-
-    @staticmethod
-    def parse_mention(mention: str) -> UmlsTypeMention:
-        if mention == 'IdentifiedAnnotation':
-            return UmlsTypeMention.CustomDict
-        else:
-            return UmlsTypeMention[mention.replace('Mention', '')]
-
-    @staticmethod
-    def sort_concepts(unsorted: List[UmlsConcept]) -> List[UmlsConcept]:
-        """
-        :param unsorted: guarantees responses from ctakes server are
-                         identically ordered
-        :return: sorted list of concepts.
-        """
-        return sorted(unsorted, key=UmlsConcept.as_string)
-
     def from_json(self, source: dict):
         self.begin = source.get('begin')
         self.end = source.get('end')
         self.text = source.get('text')
-        self.type = self.parse_mention(source.get('type'))
-        self.polarity = self.parse_polarity(source.get('polarity'))
+        self.type = parse_mention(source.get('type'))
+        self.polarity = parse_polarity(source.get('polarity'))
         self.conceptAttributes = []
 
         # sort list of concepts ensuring same ordering
         unsorted = list(UmlsConcept(c) for c in source.get('conceptAttributes'))
 
-        for c in MatchText.sort_concepts(unsorted):
+        for c in sort_concepts(unsorted):
             self.conceptAttributes.append(c)
 
     def as_json(self):
@@ -183,6 +171,9 @@ def as_json(self):
             'type': self.type.value
         }
 
+###############################################################################
+# CtakesJSON: MatchText with list of UmlsConcept
+###############################################################################
 
 class CtakesJSON:
     """Ctakes JSON contain MatchText with list of UmlsConcept"""
@@ -204,6 +195,9 @@ def list_concept_cui(self, polarity=None) -> List[str]:
     def list_concept_tui(self, polarity=None) -> List[str]:
         return [c.tui for c in self.list_concept(polarity)]
 
+    def list_concept_vocab(self, polarity=None) -> List[str]:
+        return [umls.codingScheme for umls in self.list_concept(polarity)]
+
     def list_concept_code(self, polarity=None) -> List[str]:
         return [c.code for c in self.list_concept(polarity)]
 
@@ -220,7 +214,7 @@ def list_match(self,
                       filter_umls_type)
 
         if polarity is not None:
-            polarity = MatchText.parse_polarity(polarity)
+            polarity = parse_polarity(polarity)
 
         concat = []
         for semtype, matches in self.mentions.items():
@@ -238,6 +232,11 @@ def list_match_text(self, polarity=None) -> List[str]:
             m.text
             for m in self.list_match(polarity=polarity, filter_umls_type=None))
 
+    def list_match_type(self, polarity=None) -> List[UmlsTypeMention]:
+        """:return: UmlsTypeMention (enum member, not str) of every match"""
+        matches = self.list_match(polarity=polarity, filter_umls_type=None)
+        return [match.type for match in matches]
+
     def list_sign_symptom(self, polarity=None) -> List[MatchText]:
         return self.list_match(polarity, UmlsTypeMention.SignSymptom)
 
@@ -256,9 +255,27 @@ def list_anatomical_site(self, polarity=None) -> List[MatchText]:
     def list_identified_annotation(self, polarity=None) -> List[MatchText]:
         return self.list_match(polarity, UmlsTypeMention.CustomDict)
 
+    def term_freq_summary(self) -> dict:
+        """
+        Summarize term frequencies, calling term_freq() once per facet.
+        :return: dict with keys 'text', 'cui', 'code', 'vocab', 'type' and
+                 'polarity'; each value is the term_freq() of that facet
+        """
+        # term_freq is a module-level helper function, not an attribute of
+        # CtakesJSON, so it is called directly.
+        # NOTE(review): list_polarity is not visible here - confirm it exists.
+        return {
+            'text': term_freq(self.list_match_text()),
+            'cui': term_freq(self.list_concept_cui()),
+            'code': term_freq(self.list_concept_code()),
+            'vocab': term_freq(self.list_concept_vocab()),
+            'type': term_freq([t.name for t in self.list_match_type()]),
+            'polarity': term_freq(
+                [p.name for p in self.list_polarity(self.list_match())])}
+
     def from_json(self, source: dict) -> None:
         for mention, match_list in source.items():
-            semtype = MatchText.parse_mention(mention)
+            semtype = parse_mention(mention)
 
             if semtype not in self.mentions:
                 self.mentions[semtype] = []
@@ -273,3 +290,64 @@ def as_json(self):
 
             res[mention.value] = match_json
         return res
+
+
+###############################################################################
+#
+# Helper Functions
+#
+###############################################################################
+
+###############################################################################
+# Parsing Types
+###############################################################################
+
+def parse_polarity(polarity) -> Polarity:
+    """Return polarity as Polarity: -1 is neg, 0 is pos, Polarity is kept."""
+    if isinstance(polarity, Polarity):
+        return polarity
+    if polarity == -1:
+        return Polarity.neg
+    if polarity == 0:
+        return Polarity.pos
+    raise ValueError(f'polarity unknown: {polarity}')  # still an Exception
+
+def parse_mention(mention: str) -> UmlsTypeMention:
+    """Look up the UmlsTypeMention member named by a cTAKES mention type."""
+    if mention == 'IdentifiedAnnotation':
+        return UmlsTypeMention.CustomDict  # special case: member name differs
+    return UmlsTypeMention[mention.replace('Mention', '')]
+
+###############################################################################
+# Sorting
+###############################################################################
+
+def sort_concepts(unsorted: List[UmlsConcept]) -> List[UmlsConcept]:
+    """Return a new list of the concepts, ordered by UmlsConcept.as_string.
+    Sorting makes responses from ctakes server identically ordered.
+    """
+    ordered = list(unsorted)
+    ordered.sort(key=UmlsConcept.as_string)
+    return ordered
+
+###############################################################################
+# Term Frequency
+###############################################################################
+
+def term_freq(term_list: list) -> dict:
+    """
+    Term Frequency calculation: count how often each term occurs.
+
+    Terms with equal counts keep the order in which they first appear in
+    term_list, because Counter.most_common() uses a stable sort.
+
+    :param term_list: String, CUI, CODE, or other hashable "term"
+    :return: OrderedDict of term -> count, sorted by count descending
+             (empty OrderedDict when term_list is empty)
+    """
+    # Counter replaces the hand-rolled counting loop and most_common() returns
+    # (term, count) pairs already sorted by count descending.
+    counts = Counter(term_list)
+
+    # OrderedDict is kept as the concrete return type for callers relying on it.
+    return OrderedDict(counts.most_common())
diff --git a/test-integration/__init__.py b/test-integration/__init__.py
diff --git a/test/test_client_covid_symptoms.py → ...integration/test_client_covid_symptoms.py b/test/test_client_covid_symptoms.py → ...integration/test_client_covid_symptoms.py
@@ -3,7 +3,7 @@
 import json
 import unittest
 import ctakesclient
-from .test_resources import LoadResource
+from test.test_resources import LoadResource
 
 
 def pretty(result: dict):
@@ -26,8 +26,8 @@ def test_covid_symptoms_medical_synonyms(self):
             'Myalgias': 'Muscle aches and pain',
             'Chills': 'Fever or chills',
             'Post-tussive': 'after Coughing',
-            'tussive':'related to Coughing',
-            'Pharyngitis':'sore throat',
+            'tussive': 'related to Coughing',
+            'Pharyngitis': 'sore throat',
             'Odynophagia': 'sore throat',
             'Loss of taste': 'Anosmia',
             'Loss of smell': 'Anosmia',

diff --git a/test/test_client_ctakes_rest_server.py → ...gration/test_client_ctakes_rest_server.py b/test/test_client_ctakes_rest_server.py → ...gration/test_client_ctakes_rest_server.py
@@ -2,7 +2,7 @@
 
 import unittest
 import ctakesclient
-from .test_resources import LoadResource
+from test.test_resources import LoadResource
 
 class TestClientCtakesRestServer(unittest.TestCase):
     """Test case for REST requests"""