From 5848764219314ea51e7e296f9c189976511a61ae Mon Sep 17 00:00:00 2001 From: aamir Date: Mon, 21 Feb 2022 22:47:40 +0530 Subject: [PATCH 1/6] update -- added functionality to extract distinct keywords from a list of input strings --- src/distinct_keywords/keywords.py | 38 +++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/src/distinct_keywords/keywords.py b/src/distinct_keywords/keywords.py index fc07585..1bb0b87 100644 --- a/src/distinct_keywords/keywords.py +++ b/src/distinct_keywords/keywords.py @@ -53,10 +53,16 @@ def __get_wordnet_count(self,word): except: return 0 - def get_keywords(self,input_document:str,min_length=2,include_proper_nouns=True,max_proper_noun_count=5): - doc=self.__preprocess_no_lemmatization(input_document) + def get_keywords_from_text(self, + input_document:str, + doc, + min_length=2, + include_proper_nouns=True, + max_proper_noun_count=5): + + input_text=self.__preprocess_no_lemmatization(input_document) trie=Trie(string.ascii_lowercase+string.digits) - for word in doc.split(): + for word in input_text.split(): if word in self.stop_words: continue try: @@ -79,10 +85,32 @@ def get_keywords(self,input_document:str,min_length=2,include_proper_nouns=True, keywords.append(trie[i].popitem(index=-1)[0]) keywords=[i.replace('_',' ') for i in keywords if i in input_document] if include_proper_nouns: - proper_nouns=[strip_multiple_whitespaces(strip_non_alphanum(tok.text)) for tok in self.nlp(input_document).noun_chunks] + proper_nouns=[strip_multiple_whitespaces(strip_non_alphanum(tok.text)) for tok in doc.noun_chunks] proper_nouns=[i for i in proper_nouns if i.lower() not in self.stop_words] top_proper_nouns={i[0] for i in Counter(proper_nouns).most_common(max_proper_noun_count)} return list(set(keywords).union(top_proper_nouns)) return keywords - \ No newline at end of file + def get_keywords(self, + input_documents, + min_length=2, + include_proper_nouns=True, + max_proper_noun_count=5): + + if isinstance(input_documents, str): + input_documents = [input_documents] + + keywords_for_all_input_samples = [] + + for doc in self.nlp.pipe(input_documents): + + input_document = doc.text + document_keywords = self.get_keywords_from_text(input_document=input_document, + doc=doc, + min_length=min_length, + include_proper_nouns=include_proper_nouns, + max_proper_noun_count=max_proper_noun_count) + + keywords_for_all_input_samples.append(document_keywords) + + return keywords_for_all_input_samples \ No newline at end of file From 50f37ecc6d54df3a9784bd522fb0dfc766dab3cb Mon Sep 17 00:00:00 2001 From: aamir Date: Mon, 21 Feb 2022 22:59:14 +0530 Subject: [PATCH 2/6] update -- updated test notebook with example to extract from list of strings --- keyword_test.ipynb | 161 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 122 insertions(+), 39 deletions(-) diff --git a/keyword_test.ipynb b/keyword_test.ipynb index 17f276b..35d03f8 100644 --- a/keyword_test.ipynb +++ b/keyword_test.ipynb @@ -40,39 +40,39 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['mapping',\n", - " 'learning',\n", - " 'a function',\n", - " 'scenario',\n", - " 'pairs',\n", - " 'reasonable',\n", - " 'the machine',\n", - " 'vector',\n", - " 'task',\n", - " 'inductive',\n", - " 'typically',\n", - " 'called',\n", - " 'bias',\n", - " 'examples',\n", - " 'supervisory',\n", - " 'maps',\n", - " 'training',\n", - " 'unseen',\n", - " ' Supervised learning',\n", - " 'machine learning',\n", - " 'algorithm',\n", - " 'based',\n", - " 'supervised',\n", - " 'the training data']" + "[['typically',\n", + " 'unseen',\n", + " 'supervised',\n", + " 'called',\n", + " 'the machine',\n", + " 'vector',\n", + " 'maps',\n", + " 'bias',\n", + " 'a function',\n", + " 'mapping',\n", + " 'examples',\n", + " 'algorithm',\n", + " 'inductive',\n", + " 'learning',\n", + " 'machine learning',\n", + " 'scenario',\n", + " 'based',\n", + " 'training',\n", + " 'task',\n", + " 'reasonable',\n", + " ' Supervised learning',\n", + " 'the training data',\n", + " 'pairs',\n", + " 'supervisory']]" ] }, - "execution_count": 9, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -83,7 +83,90 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "docs = [\"\"\"Supervised learning is the machine learning task of learning a function that\n", + " maps an input to an output based on example input-output pairs. It infers a\n", + " function from labeled training data consisting of a set of training examples.\n", + " In supervised learning, each example is a pair consisting of an input object\n", + " (typically a vector) and a desired output value (also called the supervisory signal). \n", + " A supervised learning algorithm analyzes the training data and produces an inferred function, \n", + " which can be used for mapping new examples. An optimal scenario will allow for the \n", + " algorithm to correctly determine the class labels for unseen instances. This requires \n", + " the learning algorithm to generalize from the training data to unseen situations in a \n", + " 'reasonable' way (see inductive bias).\"\"\",\n", + " \"\"\"K-Nearest Neighbours is one of the most basic yet essential classification algorithms\n", + " in Machine Learning. It belongs to the supervised learning domain and finds intense application \n", + " in pattern recognition, data mining and intrusion detection. It is widely disposable in real-life \n", + " scenarios since it is non-parametric, meaning, it does not make any underlying assumptions about \n", + " the distribution of data (as opposed to other algorithms such as GMM, which assume a Gaussian \n", + " distribution of the given data).\"\"\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['typically',\n", + " 'unseen',\n", + " 'supervised',\n", + " 'called',\n", + " 'the training data',\n", + " 'the machine',\n", + " 'vector',\n", + " 'maps',\n", + " 'bias',\n", + " 'a function',\n", + " 'mapping',\n", + " 'examples',\n", + " 'algorithm',\n", + " 'inductive',\n", + " 'learning',\n", + " 'machine learning',\n", + " 'scenario',\n", + " 'based',\n", + " 'training',\n", + " 'task',\n", + " 'reasonable',\n", + " 'Supervised learning',\n", + " 'pairs',\n", + " 'supervisory'],\n", + " ['meaning',\n", + " 'supervised',\n", + " 'opposed',\n", + " 'intrusion',\n", + " 'learning',\n", + " 'data',\n", + " 'K Nearest Neighbours',\n", + " 'parametric',\n", + " 'the most basic yet essential classification algorithms',\n", + " 'pattern recognition',\n", + " 'detection',\n", + " 'the supervised learning domain',\n", + " 'intense application',\n", + " 'underlying',\n", + " 'disposable',\n", + " 'Machine Learning']]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_keywords.get_keywords(docs, min_length=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -92,7 +175,7 @@ "'24098176703961584660'" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -103,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -112,7 +195,7 @@ "'24098176703961584661'" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -123,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -132,7 +215,7 @@ "'24098176695545132848'" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -143,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -152,7 +235,7 @@ "'10361698322165241623'" ] }, - "execution_count": 14, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -163,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -172,7 +255,7 @@ "'12050206188136005184'" ] }, - "execution_count": 15, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -183,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -192,7 +275,7 @@ "'12062515252859527804'" ] }, - "execution_count": 16, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -228,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.8.10" }, "orig_nbformat": 4 }, From a9faf75dcc04262b135c27bd7fbd710f537b183a Mon Sep 17 00:00:00 2001 From: aamir Date: Mon, 21 Feb 2022 23:00:32 +0530 Subject: [PATCH 3/6] updated README.md --- README.md | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f94db29..99341c7 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ distinct_keywords.get_keywords(doc) ``` ## Output -['machine learning', +[['machine learning', 'pairs', 'mapping', 'vector', @@ -74,8 +74,67 @@ distinct_keywords.get_keywords(doc) 'task', 'algorithm', 'unseen', - 'training'] + 'training']] +``` +from distinct_keywords.keywords import DistinctKeywords + +#can also use it to extract keywords for multiple strings at once +docs = ["Supervised learning is the machine learning task of learning a function that + maps an input to an output based on example input-output pairs. It infers a + function from labeled training data consisting of a set of training examples. + In supervised learning, each example is a pair consisting of an input object + (typically a vector) and a desired output value (also called the supervisory signal). + A supervised learning algorithm analyzes the training data and produces an inferred function, + which can be used for mapping new examples. An optimal scenario will allow for the + algorithm to correctly determine the class labels for unseen instances. This requires + the learning algorithm to generalize from the training data to unseen situations in a + 'reasonable' way (see inductive bias).", + "K-Nearest Neighbours is one of the most basic yet essential classification algorithms in Machine Learning. It belongs to the supervised learning domain and finds intense application in pattern recognition, data mining and intrusion detection. It is widely disposable in real-life scenarios since it is non-parametric, meaning, it does not make any underlying assumptions about the distribution of data (as opposed to other algorithms such as GMM, which assume a Gaussian distribution of the given data)."] +``` +## Output + +[['examples', + 'based', + 'supervised', + 'bias', + 'training', + 'maps', + 'a function', + 'the machine', + 'typically', + 'pairs', + 'reasonable', + 'vector', + 'called', + 'the training data', + 'scenario', + 'learning', + 'unseen', + 'Supervised learning', + 'task', + 'inductive', + 'mapping', + 'algorithm', + 'machine learning', + 'supervisory'], + ['opposed', + 'supervised', + 'disposable', + 'learning', + 'Machine Learning', + 'the most basic yet essential classification algorithms', + 'intrusion', + 'intense application', + 'the supervised learning domain', + 'detection', + 'pattern recognition', + 'parametric', + 'underlying', + 'meaning', + 'data', + 'K Nearest Neighbours']] + ## German Model (On test) I have added a german model based on word2vec found at https://devmount.github.io/GermanWordEmbeddings/ You can find it in the example folder From 91d8c88cecea6d1480e292ad23020b3787a26575 Mon Sep 17 00:00:00 2001 From: aamir Date: Mon, 21 Feb 2022 23:05:56 +0530 Subject: [PATCH 4/6] update -- added correct outputs to readme --- README.md | 69 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 99341c7..4205b89 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ distinct_keywords.get_keywords(doc) from distinct_keywords.keywords import DistinctKeywords #can also use it to extract keywords for multiple strings at once -docs = ["Supervised learning is the machine learning task of learning a function that +docs = ["""Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a function from labeled training data consisting of a set of training examples. In supervised learning, each example is a pair consisting of an input object @@ -89,52 +89,51 @@ docs = ["Supervised learning is the machine learning task of learning a function which can be used for mapping new examples. An optimal scenario will allow for the algorithm to correctly determine the class labels for unseen instances. This requires the learning algorithm to generalize from the training data to unseen situations in a - 'reasonable' way (see inductive bias).", - "K-Nearest Neighbours is one of the most basic yet essential classification algorithms in Machine Learning. It belongs to the supervised learning domain and finds intense application in pattern recognition, data mining and intrusion detection. It is widely disposable in real-life scenarios since it is non-parametric, meaning, it does not make any underlying assumptions about the distribution of data (as opposed to other algorithms such as GMM, which assume a Gaussian distribution of the given data)."] + 'reasonable' way (see inductive bias).""", + """K-Nearest Neighbours is one of the most basic yet essential classification algorithms + in Machine Learning. It belongs to the supervised learning domain and finds intense application + in pattern recognition, data mining and intrusion detection. It is widely disposable in real-life + scenarios since it is non-parametric, meaning, it does not make any underlying assumptions about + the distribution of data (as opposed to other algorithms such as GMM, which assume a Gaussian + distribution of the given data)."""] + +distinct_keywords=DistinctKeywords() + +distinct_keywords.get_keywords(docs) ``` ## Output -[['examples', - 'based', +[['supervisory', + 'typically', + 'algorithm', 'supervised', - 'bias', - 'training', - 'maps', - 'a function', + 'unseen', + 'machine learning', + 'the training data', 'the machine', - 'typically', - 'pairs', - 'reasonable', 'vector', - 'called', - 'the training data', - 'scenario', - 'learning', - 'unseen', - 'Supervised learning', + 'training', 'task', - 'inductive', - 'mapping', - 'algorithm', - 'machine learning', - 'supervisory'], - ['opposed', + 'bias', + 'a function', + 'Supervised learning', + 'pairs', + 'mapping'], + ['meaning', 'supervised', - 'disposable', + 'opposed', + 'intrusion', 'learning', - 'Machine Learning', + 'K Nearest Neighbours', + 'parametric', 'the most basic yet essential classification algorithms', - 'intrusion', - 'intense application', - 'the supervised learning domain', - 'detection', 'pattern recognition', - 'parametric', + 'detection', + 'the supervised learning domain', + 'intense application', 'underlying', - 'meaning', - 'data', - 'K Nearest Neighbours']] - + 'Machine Learning']] + ## German Model (On test) I have added a german model based on word2vec found at https://devmount.github.io/GermanWordEmbeddings/ You can find it in the example folder From 69611074ab9526ec955626528898fc3d7b4f1381 Mon Sep 17 00:00:00 2001 From: aamir Date: Wed, 23 Feb 2022 01:37:42 +0530 Subject: [PATCH 5/6] update -- updated function name as per comment on PR #1 --- README.md | 6 +-- keyword_test.ipynb | 115 +++++++++++++++++++++------------------------ 2 files changed, 57 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 4205b89..85a32b7 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ distinct_keywords.get_keywords(doc) ``` ## Output -[['machine learning', +['machine learning', 'pairs', 'mapping', 'vector', @@ -74,7 +74,7 @@ distinct_keywords.get_keywords(doc) 'task', 'algorithm', 'unseen', - 'training']] + 'training'] ``` from distinct_keywords.keywords import DistinctKeywords @@ -99,7 +99,7 @@ docs = ["""Supervised learning is the machine learning task of learning a functi distinct_keywords=DistinctKeywords() -distinct_keywords.get_keywords(docs) +distinct_keywords.get_multiple_doc_keywords(docs) ``` ## Output diff --git a/keyword_test.ipynb b/keyword_test.ipynb index 35d03f8..af9360a 100644 --- a/keyword_test.ipynb +++ b/keyword_test.ipynb @@ -46,30 +46,30 @@ { "data": { "text/plain": [ - "[['typically',\n", - " 'unseen',\n", - " 'supervised',\n", - " 'called',\n", - " 'the machine',\n", - " 'vector',\n", - " 'maps',\n", - " 'bias',\n", - " 'a function',\n", - " 'mapping',\n", - " 'examples',\n", - " 'algorithm',\n", - " 'inductive',\n", - " 'learning',\n", - " 'machine learning',\n", - " 'scenario',\n", - " 'based',\n", - " 'training',\n", - " 'task',\n", - " 'reasonable',\n", - " ' Supervised learning',\n", - " 'the training data',\n", - " 'pairs',\n", - " 'supervisory']]" + "['bias',\n", + " 'inductive',\n", + " 'training',\n", + " 'vector',\n", + " 'maps',\n", + " 'the machine',\n", + " 'task',\n", + " 'supervisory',\n", + " 'reasonable',\n", + " 'scenario',\n", + " 'called',\n", + " 'mapping',\n", + " 'unseen',\n", + " ' Supervised learning',\n", + " 'supervised',\n", + " 'examples',\n", + " 'a function',\n", + " 'algorithm',\n", + " 'pairs',\n", + " 'machine learning',\n", + " 'typically',\n", + " 'learning',\n", + " 'based',\n", + " 'the training data']" ] }, "execution_count": 4, @@ -113,46 +113,46 @@ { "data": { "text/plain": [ - "[['typically',\n", - " 'unseen',\n", - " 'supervised',\n", - " 'called',\n", - " 'the training data',\n", - " 'the machine',\n", + "[['bias',\n", + " 'inductive',\n", + " 'training',\n", " 'vector',\n", " 'maps',\n", - " 'bias',\n", - " 'a function',\n", + " 'Supervised learning',\n", + " 'the machine',\n", + " 'task',\n", + " 'supervisory',\n", + " 'reasonable',\n", + " 'scenario',\n", + " 'called',\n", " 'mapping',\n", + " 'unseen',\n", + " 'supervised',\n", " 'examples',\n", + " 'a function',\n", " 'algorithm',\n", - " 'inductive',\n", - " 'learning',\n", - " 'machine learning',\n", - " 'scenario',\n", - " 'based',\n", - " 'training',\n", - " 'task',\n", - " 'reasonable',\n", - " 'Supervised learning',\n", " 'pairs',\n", - " 'supervisory'],\n", - " ['meaning',\n", - " 'supervised',\n", - " 'opposed',\n", - " 'intrusion',\n", + " 'machine learning',\n", + " 'typically',\n", " 'learning',\n", - " 'data',\n", - " 'K Nearest Neighbours',\n", - " 'parametric',\n", - " 'the most basic yet essential classification algorithms',\n", + " 'based',\n", + " 'the training data'],\n", + " ['intrusion',\n", + " 'intense application',\n", " 'pattern recognition',\n", " 'detection',\n", - " 'the supervised learning domain',\n", - " 'intense application',\n", " 'underlying',\n", + " 'data',\n", + " 'parametric',\n", " 'disposable',\n", - " 'Machine Learning']]" + " 'Machine Learning',\n", + " 'the supervised learning domain',\n", + " 'supervised',\n", + " 'K Nearest Neighbours',\n", + " 'the most basic yet essential classification algorithms',\n", + " 'opposed',\n", + " 'learning',\n", + " 'meaning']]" ] }, "execution_count": 6, @@ -161,7 +161,7 @@ } ], "source": [ - "distinct_keywords.get_keywords(docs, min_length=4)" + "distinct_keywords.get_multiple_doc_keywords(docs, min_length=4)" ] }, { @@ -283,13 +283,6 @@ "source": [ "distinct_keywords.hilbert_lookup_dictionary['deep_learning']" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 4d884643ecc87b076179670e25bbfd6ed0c57330 Mon Sep 17 00:00:00 2001 From: aamir Date: Wed, 23 Feb 2022 01:38:50 +0530 Subject: [PATCH 6/6] refactor -- modularized code into pieces in DistinctKeywords class to run separately for a single string and for a list of strings --- src/distinct_keywords/keywords.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/distinct_keywords/keywords.py b/src/distinct_keywords/keywords.py index 1bb0b87..2a4923a 100644 --- a/src/distinct_keywords/keywords.py +++ b/src/distinct_keywords/keywords.py @@ -1,3 +1,4 @@ +from ast import keyword import warnings warnings.filterwarnings('ignore') import joblib @@ -92,17 +93,29 @@ def get_keywords_from_text(self, return keywords def get_keywords(self, - input_documents, + input_document:str, min_length=2, include_proper_nouns=True, max_proper_noun_count=5): + + doc = self.nlp(input_document) + keywords = self.get_keywords_from_text(input_document=input_document, + doc=doc, + min_length=min_length, + include_proper_nouns=include_proper_nouns, + max_proper_noun_count=max_proper_noun_count) + + return keywords - if isinstance(input_documents, str): - input_documents = [input_documents] + def get_multiple_doc_keywords(self, + docs:list, + min_length=2, + include_proper_nouns=True, + max_proper_noun_count=5): keywords_for_all_input_samples = [] - for doc in self.nlp.pipe(input_documents): + for doc in self.nlp.pipe(docs): input_document = doc.text document_keywords = self.get_keywords_from_text(input_document=input_document,