diff --git a/modules/phrase_learning.py b/modules/phrase_learning.py
index 444c027..bbdf55e 100644
--- a/modules/phrase_learning.py
+++ b/modules/phrase_learning.py
@@ -92,7 +92,7 @@ def CleanAndSplitText(frame):
             # If the last word ends in a period then remove the period
             lastWord = regexPeriod.sub("", words[last])
             # If the last word is an abbreviation like "U.S."
-            # then add the word final perios back on
+            # then add the word final period back on
             if "\." in lastWord:
                 lastWord += "."
             phraseOut += lastWord
@@ -105,7 +105,7 @@ def CleanAndSplitText(frame):
 
     return frameOut
 
-# count the number of occurances of all 2-gram, 3-ngram, and 4-gram word sequences.
+# count the number of occurrences of all 2-gram, 3-ngram, and 4-gram word sequences.
 def ComputeNgramStats(textData,functionwordHash,blacklistHash):
 
     # Create an array to store the total count of all ngrams up to 4-grams
@@ -124,7 +124,7 @@ def ComputeNgramStats(textData,functionwordHash,blacklistHash):
     # for phrase modeling. The expression says words in phrases
     # must either:
     # (1) contain an alphabetic character, or
-    # (2) be the single charcater '&', or
+    # (2) be the single character '&', or
     # (3) be a one or two digit number
     reWordIsValid = re.compile('[A-Za-z]|^&$|^\d\d?$')
 
@@ -154,7 +154,7 @@ def ComputeNgramStats(textData,functionwordHash,blacklistHash):
             word = wordArray[j]
 
             # Only bother counting the ngrams that start with a valid content word
-            # i.e., valids words not in the function word list or the black list
+            # i.e., valid words not in the function word list or the black list
             if ( ( word not in functionwordHash ) and ( word not in blacklistHash ) and validArray[j] ):
 
                 # Initialize ngram string with first content word and add it to unigram counts
@@ -236,7 +236,7 @@ def ApplyPhraseRewrites(rankedNgrams,textData,learnedPhrases,
 
     # This function will consider at most maxRewrite
     # new phrases to be added into the learned phrase
-    # list as specified by the calling fuinction
+    # list as specified by the calling function
     maxRewrite=maxPhrasesToAdd
 
     # If the remaining number of proposed ngram phrases is less
@@ -475,7 +475,7 @@ def ApplyPhraseRewritesInPlace(textFrame, textColumnName, phraseRules):
                 rightConflictHash[leftWord] = 1
             prevConflictHash[outputPhrase] = 1
 
-            # Add extra space to input an output versions of the current phrase
+            # Add extra space to input and output versions of the current phrase
             # to make the regex rewrite easier
            outputPhrase = " " + outputPhrase
            lastAddedPhrase = " " + nextPhrase
@@ -509,7 +509,7 @@ def ApplyPhraseRewritesInPlace(textFrame, textColumnName, phraseRules):
     # Apply the regex over the full data set
     for i in range(0,numLines):
 
-        # The regex substituion looks up the output string rewrite
+        # The regex substitution looks up the output string rewrite
         # in the hash table for each matched input phrase regex
         textOutput[i] = regexPhrase.sub(lambda mo: phraseRewriteHash[mo.string[mo.start():mo.end()]], textOutput[i])
diff --git a/notebooks/Part_1_Data_Preparation.ipynb b/notebooks/Part_1_Data_Preparation.ipynb
index ec24aa0..fede2ac 100644
--- a/notebooks/Part_1_Data_Preparation.ipynb
+++ b/notebooks/Part_1_Data_Preparation.ipynb
@@ -80,7 +80,7 @@
    "| answers (A) | Id | String | The unique answer ID (primary key)\n",
    "| | text0 | String | The raw text data of the answer\n",
    "\n",
-    "The datasets are compressed and stored in Azure Blob storage as `.tsv.gz` files and this section provides you the code to retreive the data in the notebook."
+ "The datasets are compressed and stored in Azure Blob storage as `.tsv.gz` files and this section provides you the code to retrieve the data in the notebook." ] }, { @@ -236,7 +236,7 @@ } ], "source": [ - "# This text include the HTML code.\n", + "# This text includes the HTML code.\n", "print(questions[\"Text0\"][220231])" ] }, @@ -359,7 +359,7 @@ }, "outputs": [], "source": [ - "# find the AnswerIds has at least 3 dupes.\n", + "# find the AnswerIds that have at least 3 dupes.\n", "def find_answerId(answersC, dupesC, num_dupes):\n", " \n", " countHash = {}\n", diff --git a/notebooks/Part_2_Phrase_Learning.ipynb b/notebooks/Part_2_Phrase_Learning.ipynb index 9ec06ad..370b52c 100644 --- a/notebooks/Part_2_Phrase_Learning.ipynb +++ b/notebooks/Part_2_Phrase_Learning.ipynb @@ -135,7 +135,7 @@ "\n", "The CleanAndSplitText function from __phrase_learning__ takes as input a list where each row element is a single cohesive long string of text, i.e. a \"question\". The function first splits each string by various forms of punctuation into chunks of text that are likely sentences, phrases or sub-phrases. The splitting is designed to prohibit the phrase learning process from using cross-sentence or cross-phrase word strings when learning phrases.\n", "\n", - "The function returns a table where each row represents a chunk of text from the questions. The `DocID` coulmn indicates the original row index from associated question in the input from which the chunk of text originated. The `DocLine` column contains the original text excluding the punctuation marks and `HTML` markup that have been during the cleaning process. The `Lowercase Taxt` column contains a fully lower-cased version of the text in the `CleanedText` column." + "The function returns a table where each row represents a chunk of text from the questions. The `DocID` coulmn indicates the original row index from associated question in the input from which the chunk of text originated. The `DocLine` column contains the original text excluding the punctuation marks and `HTML` markup that have been during the cleaning process. The `Lowercase Text` column contains a fully lower-cased version of the text in the `CleanedText` column." ] }, { diff --git a/notebooks/Part_3_Model_Training_and_Evaluation.ipynb b/notebooks/Part_3_Model_Training_and_Evaluation.ipynb index 5e63780..52fe136 100644 --- a/notebooks/Part_3_Model_Training_and_Evaluation.ipynb +++ b/notebooks/Part_3_Model_Training_and_Evaluation.ipynb @@ -6,7 +6,7 @@ "source": [ "# Part 3: Model Training and Evaluation\n", "\n", - "If you haven't complete the **Part 1: Data Preparation** and **Part 2: Phrase Learning**, please complete them before moving forward with **Part 3: Model Training and Evaluation**.\n", + "If you haven't completed the **Part 1: Data Preparation** and **Part 2: Phrase Learning**, please complete them before moving forward with **Part 3: Model Training and Evaluation**.\n", "\n", "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n", "\n", @@ -241,7 +241,7 @@ "source": [ "### Support Vector Machine (TF-IDF as features)\n", "\n", - "Traditionally, Support Vector Machine (SVM) model finds a hyperplane which maximally seperates positive and negative training tokens in a vector space. In its standard form, an SVM is a two-class classifier. 
To create a SVM model for a problem with multiple classes, a one-versus-rest (OVR) SVM classifier is typically learned for each answer class.\n", + "Traditionally, Support Vector Machine (SVM) model finds a hyperplane which maximally separates positive and negative training tokens in a vector space. In its standard form, an SVM is a two-class classifier. To create a SVM model for a problem with multiple classes, a one-versus-rest (OVR) SVM classifier is typically learned for each answer class.\n", "\n", "The `sklearn` Python package implement such a classifier and we use the implementation in this example. More information about this `LinearSVC` classifier can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)." ]
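For reference, a minimal sketch of the one-versus-rest `LinearSVC` approach the last hunk describes, using TF-IDF features from `sklearn`; it is not part of the patch, and the sample questions, `answer_...` labels, and parameter choices are placeholders rather than the project's actual data or pipeline.

```python
# Illustrative sketch only: a one-versus-rest LinearSVC over TF-IDF features,
# mirroring the approach described in the notebook text. The texts, labels,
# and parameters below are placeholders, not the project's real data.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

train_texts = [
    "how do i read a csv file into a dataframe",
    "convert a string to an integer in python",
]
train_labels = ["answer_101", "answer_202"]  # hypothetical AnswerId classes

# Turn each question into a TF-IDF weighted bag of unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_texts)

# LinearSVC fits one one-versus-rest linear classifier per answer class.
classifier = LinearSVC(C=1.0)
classifier.fit(X_train, train_labels)

# Score an unseen question against the learned answer classes.
X_test = vectorizer.transform(["reading csv files with pandas"])
print(classifier.predict(X_test))
```

Because `LinearSVC` performs the one-versus-rest decomposition internally, one binary classifier per answer class is learned without any extra wrapping code.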