14 changes: 7 additions & 7 deletions modules/phrase_learning.py
@@ -92,7 +92,7 @@ def CleanAndSplitText(frame):
# If the last word ends in a period then remove the period
lastWord = regexPeriod.sub("", words[last])
# If the last word is an abbreviation like "U.S."
-# then add the word final perios back on
+# then add the word final period back on
if "\." in lastWord:
lastWord += "."
phraseOut += lastWord
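As an aside, the period-handling logic in this hunk can be sketched in isolation. This is a simplified illustration, not the module's exact code: the regexPeriod pattern is assumed (its definition is outside this hunk), and the abbreviation check uses a plain "." test rather than the escaped literal in the original.

```python
import re

# Assumed pattern: strip a single trailing period from a token.
regexPeriod = re.compile(r"\.$")

for word in ["system.", "U.S.", "done"]:
    lastWord = regexPeriod.sub("", word)
    # If a period remains inside the token (e.g. "U.S"), treat it as an
    # abbreviation and restore the word-final period.
    if "." in lastWord:
        lastWord += "."
    print(word, "->", lastWord)   # system. -> system, U.S. -> U.S., done -> done
```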
@@ -105,7 +105,7 @@ def CleanAndSplitText(frame):

return frameOut

-# count the number of occurances of all 2-gram, 3-ngram, and 4-gram word sequences.
+# count the number of occurrences of all 2-gram, 3-gram, and 4-gram word sequences.
def ComputeNgramStats(textData,functionwordHash,blacklistHash):

# Create an array to store the total count of all ngrams up to 4-grams
@@ -124,7 +124,7 @@ def ComputeNgramStats(textData,functionwordHash,blacklistHash):
# for phrase modeling. The expression says words in phrases
# must either:
# (1) contain an alphabetic character, or
-# (2) be the single charcater '&', or
+# (2) be the single character '&', or
# (3) be a one or two digit number
reWordIsValid = re.compile('[A-Za-z]|^&$|^\d\d?$')
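A quick standalone check of the reWordIsValid pattern above, using an illustrative token list (how the module applies the pattern, e.g. search vs. match, is not shown in this hunk):

```python
import re

# Valid phrase words contain a letter, are the single character '&',
# or are a one- or two-digit number.
reWordIsValid = re.compile(r'[A-Za-z]|^&$|^\d\d?$')

for token in ["server", "&", "42", "2017", "--"]:
    print(token, bool(reWordIsValid.search(token)))
# server True, & True, 42 True, 2017 False, -- False
```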

@@ -154,7 +154,7 @@ def ComputeNgramStats(textData,functionwordHash,blacklistHash):
word = wordArray[j]

# Only bother counting the ngrams that start with a valid content word
-# i.e., valids words not in the function word list or the black list
+# i.e., valid words not in the function word list or the black list
if ( ( word not in functionwordHash ) and ( word not in blacklistHash ) and validArray[j] ):

# Initialize ngram string with first content word and add it to unigram counts
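The counting rule described above, tallying only n-grams that start with a content word, can be illustrated with a toy sketch. The word lists and the use of collections.Counter are illustrative; the module's own implementation also tracks unigrams and per-order totals.

```python
from collections import Counter

function_words = {"the", "of", "to", "a"}   # illustrative function-word list
blacklist = set()                           # illustrative (empty) blacklist

tokens = "the cost of the azure machine learning service".split()
ngram_counts = Counter()

for i, word in enumerate(tokens):
    # Only count n-grams that start with a valid content word.
    if word in function_words or word in blacklist:
        continue
    for n in range(2, 5):                   # 2-, 3-, and 4-grams
        if i + n <= len(tokens):
            ngram_counts[" ".join(tokens[i:i + n])] += 1

print(ngram_counts.most_common(3))
```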
@@ -236,7 +236,7 @@ def ApplyPhraseRewrites(rankedNgrams,textData,learnedPhrases,

# This function will consider at most maxRewrite
# new phrases to be added into the learned phrase
-# list as specified by the calling fuinction
+# list as specified by the calling function
maxRewrite=maxPhrasesToAdd

# If the remaining number of proposed ngram phrases is less
@@ -475,7 +475,7 @@ def ApplyPhraseRewritesInPlace(textFrame, textColumnName, phraseRules):
rightConflictHash[leftWord] = 1
prevConflictHash[outputPhrase] = 1

-# Add extra space to input an output versions of the current phrase
+# Add extra space to input and output versions of the current phrase
# to make the regex rewrite easier
outputPhrase = " " + outputPhrase
lastAddedPhrase = " " + nextPhrase
@@ -509,7 +509,7 @@ def ApplyPhraseRewritesInPlace(textFrame, textColumnName, phraseRules):

# Apply the regex over the full data set
for i in range(0,numLines):
-# The regex substituion looks up the output string rewrite
+# The regex substitution looks up the output string rewrite
# in the hash table for each matched input phrase regex
textOutput[i] = regexPhrase.sub(lambda mo: phraseRewriteHash[mo.string[mo.start():mo.end()]], textOutput[i])

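The rewrite step at the end of this file uses a single regex that matches any learned phrase and a hash that maps each matched surface form to its rewritten form. A minimal standalone sketch of that pattern (the example phrases, the surrounding-space convention, and the underscore joining are illustrative assumptions):

```python
import re

# Map each input phrase (padded with spaces) to its rewritten form.
phraseRewriteHash = {
    " stack overflow ": " stack_overflow ",
    " azure machine learning ": " azure_machine_learning ",
}

# One alternation over every phrase to be rewritten.
regexPhrase = re.compile("|".join(re.escape(p) for p in phraseRewriteHash))

text = " i asked this on stack overflow about azure machine learning "
# mo.group(0) is the matched input phrase, equivalent to the
# mo.string[mo.start():mo.end()] slice used in the module.
print(regexPhrase.sub(lambda mo: phraseRewriteHash[mo.group(0)], text))
# " i asked this on stack_overflow about azure_machine_learning "
```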
6 changes: 3 additions & 3 deletions notebooks/Part_1_Data_Preparation.ipynb
@@ -80,7 +80,7 @@
"| answers (A) | Id | String | The unique answer ID (primary key)\n",
"| | text0 | String | The raw text data of the answer\n",
"\n",
"The datasets are compressed and stored in Azure Blob storage as `.tsv.gz` files and this section provides you the code to retreive the data in the notebook."
"The datasets are compressed and stored in Azure Blob storage as `.tsv.gz` files and this section provides you the code to retrieve the data in the notebook."
]
},
{
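For reference, loading one of these compressed TSV files into a pandas DataFrame typically looks like the sketch below; the URL is a placeholder, not the actual blob location used by the notebook.

```python
import pandas as pd

# Placeholder URL -- substitute the real blob storage path from the notebook.
questions_url = "https://<storage-account>.blob.core.windows.net/<container>/questions.tsv.gz"

questions = pd.read_csv(
    questions_url,
    sep="\t",             # tab-separated values
    compression="gzip",   # .tsv.gz files are gzip-compressed
    encoding="utf-8",
)
print(questions.head())
```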
@@ -236,7 +236,7 @@
}
],
"source": [
"# This text include the HTML code.\n",
"# This text includes the HTML code.\n",
"print(questions[\"Text0\"][220231])"
]
},
@@ -359,7 +359,7 @@
},
"outputs": [],
"source": [
"# find the AnswerIds has at least 3 dupes.\n",
"# find the AnswerIds that have at least 3 dupes.\n",
"def find_answerId(answersC, dupesC, num_dupes):\n",
" \n",
" countHash = {}\n",
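The countHash shown above suggests a simple tally-and-filter approach. A hedged sketch of that idea, using a hypothetical helper name and the AnswerId column from the tables in Part 1 (the notebook's actual find_answerId may differ in detail):

```python
def answer_ids_with_min_dupes(dupes_df, num_dupes=3):
    """Return the AnswerIds linked to at least `num_dupes` duplicate questions."""
    count_hash = {}
    for answer_id in dupes_df["AnswerId"]:
        count_hash[answer_id] = count_hash.get(answer_id, 0) + 1
    return [aid for aid, count in count_hash.items() if count >= num_dupes]
```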
2 changes: 1 addition & 1 deletion notebooks/Part_2_Phrase_Learning.ipynb
@@ -135,7 +135,7 @@
"\n",
"The CleanAndSplitText function from __phrase_learning__ takes as input a list where each row element is a single cohesive long string of text, i.e. a \"question\". The function first splits each string by various forms of punctuation into chunks of text that are likely sentences, phrases or sub-phrases. The splitting is designed to prohibit the phrase learning process from using cross-sentence or cross-phrase word strings when learning phrases.\n",
"\n",
"The function returns a table where each row represents a chunk of text from the questions. The `DocID` coulmn indicates the original row index from associated question in the input from which the chunk of text originated. The `DocLine` column contains the original text excluding the punctuation marks and `HTML` markup that have been during the cleaning process. The `Lowercase Taxt` column contains a fully lower-cased version of the text in the `CleanedText` column."
"The function returns a table where each row represents a chunk of text from the questions. The `DocID` coulmn indicates the original row index from associated question in the input from which the chunk of text originated. The `DocLine` column contains the original text excluding the punctuation marks and `HTML` markup that have been during the cleaning process. The `Lowercase Text` column contains a fully lower-cased version of the text in the `CleanedText` column."
]
},
{
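A minimal sketch of invoking CleanAndSplitText as described above. The import path and the shape of the input frame are assumptions; only the output columns are taken from the description.

```python
import pandas as pd
from modules.phrase_learning import CleanAndSplitText  # assumed import path

# Assumed input shape: one question per row, full raw text in a single column.
questions = pd.DataFrame({"Text": ["How do I reset my password? It stopped working yesterday."]})

cleaned = CleanAndSplitText(questions)
# Per the description above, each output row is one text chunk with a DocID
# pointing back to the originating question, the cleaned text, and a
# lower-cased copy of it.
print(cleaned.head())
```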
4 changes: 2 additions & 2 deletions notebooks/Part_3_Model_Training_and_Evaluation.ipynb
@@ -6,7 +6,7 @@
"source": [
"# Part 3: Model Training and Evaluation\n",
"\n",
"If you haven't complete the **Part 1: Data Preparation** and **Part 2: Phrase Learning**, please complete them before moving forward with **Part 3: Model Training and Evaluation**.\n",
"If you haven't completed the **Part 1: Data Preparation** and **Part 2: Phrase Learning**, please complete them before moving forward with **Part 3: Model Training and Evaluation**.\n",
"\n",
"**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n",
"\n",
@@ -241,7 +241,7 @@
"source": [
"### Support Vector Machine (TF-IDF as features)\n",
"\n",
"Traditionally, Support Vector Machine (SVM) model finds a hyperplane which maximally seperates positive and negative training tokens in a vector space. In its standard form, an SVM is a two-class classifier. To create a SVM model for a problem with multiple classes, a one-versus-rest (OVR) SVM classifier is typically learned for each answer class.\n",
"Traditionally, Support Vector Machine (SVM) model finds a hyperplane which maximally separates positive and negative training tokens in a vector space. In its standard form, an SVM is a two-class classifier. To create a SVM model for a problem with multiple classes, a one-versus-rest (OVR) SVM classifier is typically learned for each answer class.\n",
"\n",
"The `sklearn` Python package implement such a classifier and we use the implementation in this example. More information about this `LinearSVC` classifier can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)."
]
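A minimal sketch of an OVR LinearSVC on TF-IDF features; the toy data, labels, and pipeline shape are illustrative, not the notebook's exact training code.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Toy training data: question text paired with the id of the answer it maps to.
train_texts = [
    "how to reset my password",
    "password reset link not working",
    "connect to sql database from python",
    "sql database connection string fails",
]
train_labels = ["A1", "A1", "A2", "A2"]

# LinearSVC trains one-vs-rest classifiers by default for multiclass labels.
model = make_pipeline(TfidfVectorizer(), LinearSVC())
model.fit(train_texts, train_labels)
print(model.predict(["cannot reset password"]))   # likely ['A1'] on this toy data
```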