Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,64 @@ distinct_keywords.get_keywords(doc)
'unseen',
'training']

```
from distinct_keywords.keywords import DistinctKeywords

# You can also extract keywords from multiple documents (strings) at once
docs = ["""Supervised learning is the machine learning task of learning a function that
maps an input to an output based on example input-output pairs. It infers a
function from labeled training data consisting of a set of training examples.
In supervised learning, each example is a pair consisting of an input object
(typically a vector) and a desired output value (also called the supervisory signal).
A supervised learning algorithm analyzes the training data and produces an inferred function,
which can be used for mapping new examples. An optimal scenario will allow for the
algorithm to correctly determine the class labels for unseen instances. This requires
the learning algorithm to generalize from the training data to unseen situations in a
'reasonable' way (see inductive bias).""",
"""K-Nearest Neighbours is one of the most basic yet essential classification algorithms
in Machine Learning. It belongs to the supervised learning domain and finds intense application
in pattern recognition, data mining and intrusion detection. It is widely disposable in real-life
scenarios since it is non-parametric, meaning, it does not make any underlying assumptions about
the distribution of data (as opposed to other algorithms such as GMM, which assume a Gaussian
distribution of the given data)."""]

distinct_keywords=DistinctKeywords()

distinct_keywords.get_multiple_doc_keywords(docs)
```
## Output

[['supervisory',
'typically',
'algorithm',
'supervised',
'unseen',
'machine learning',
'the training data',
'the machine',
'vector',
'training',
'task',
'bias',
'a function',
'Supervised learning',
'pairs',
'mapping'],
['meaning',
'supervised',
'opposed',
'intrusion',
'learning',
'K Nearest Neighbours',
'parametric',
'the most basic yet essential classification algorithms',
'pattern recognition',
'detection',
'the supervised learning domain',
'intense application',
'underlying',
'Machine Learning']]

## German Model (On test)
I have added a German model based on the word2vec embeddings available at https://devmount.github.io/GermanWordEmbeddings/ (currently still being tested).
You can find it in the example folder.
152 changes: 114 additions & 38 deletions keyword_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,39 +40,39 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['mapping',\n",
" 'learning',\n",
" 'a function',\n",
" 'scenario',\n",
" 'pairs',\n",
" 'reasonable',\n",
" 'the machine',\n",
"['bias',\n",
" 'inductive',\n",
" 'training',\n",
" 'vector',\n",
" 'maps',\n",
" 'the machine',\n",
" 'task',\n",
" 'inductive',\n",
" 'typically',\n",
" 'called',\n",
" 'bias',\n",
" 'examples',\n",
" 'supervisory',\n",
" 'maps',\n",
" 'training',\n",
" 'reasonable',\n",
" 'scenario',\n",
" 'called',\n",
" 'mapping',\n",
" 'unseen',\n",
" ' Supervised learning',\n",
" 'machine learning',\n",
" 'supervised',\n",
" 'examples',\n",
" 'a function',\n",
" 'algorithm',\n",
" 'pairs',\n",
" 'machine learning',\n",
" 'typically',\n",
" 'learning',\n",
" 'based',\n",
" 'supervised',\n",
" 'the training data']"
]
},
"execution_count": 9,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -83,7 +83,90 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"docs = [\"\"\"Supervised learning is the machine learning task of learning a function that\n",
" maps an input to an output based on example input-output pairs. It infers a\n",
" function from labeled training data consisting of a set of training examples.\n",
" In supervised learning, each example is a pair consisting of an input object\n",
" (typically a vector) and a desired output value (also called the supervisory signal). \n",
" A supervised learning algorithm analyzes the training data and produces an inferred function, \n",
" which can be used for mapping new examples. An optimal scenario will allow for the \n",
" algorithm to correctly determine the class labels for unseen instances. This requires \n",
" the learning algorithm to generalize from the training data to unseen situations in a \n",
" 'reasonable' way (see inductive bias).\"\"\",\n",
" \"\"\"K-Nearest Neighbours is one of the most basic yet essential classification algorithms\n",
" in Machine Learning. It belongs to the supervised learning domain and finds intense application \n",
" in pattern recognition, data mining and intrusion detection. It is widely disposable in real-life \n",
" scenarios since it is non-parametric, meaning, it does not make any underlying assumptions about \n",
" the distribution of data (as opposed to other algorithms such as GMM, which assume a Gaussian \n",
" distribution of the given data).\"\"\"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['bias',\n",
" 'inductive',\n",
" 'training',\n",
" 'vector',\n",
" 'maps',\n",
" 'Supervised learning',\n",
" 'the machine',\n",
" 'task',\n",
" 'supervisory',\n",
" 'reasonable',\n",
" 'scenario',\n",
" 'called',\n",
" 'mapping',\n",
" 'unseen',\n",
" 'supervised',\n",
" 'examples',\n",
" 'a function',\n",
" 'algorithm',\n",
" 'pairs',\n",
" 'machine learning',\n",
" 'typically',\n",
" 'learning',\n",
" 'based',\n",
" 'the training data'],\n",
" ['intrusion',\n",
" 'intense application',\n",
" 'pattern recognition',\n",
" 'detection',\n",
" 'underlying',\n",
" 'data',\n",
" 'parametric',\n",
" 'disposable',\n",
" 'Machine Learning',\n",
" 'the supervised learning domain',\n",
" 'supervised',\n",
" 'K Nearest Neighbours',\n",
" 'the most basic yet essential classification algorithms',\n",
" 'opposed',\n",
" 'learning',\n",
" 'meaning']]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_keywords.get_multiple_doc_keywords(docs, min_length=4)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -92,7 +175,7 @@
"'24098176703961584660'"
]
},
"execution_count": 10,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -103,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -112,7 +195,7 @@
"'24098176703961584661'"
]
},
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -123,7 +206,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -132,7 +215,7 @@
"'24098176695545132848'"
]
},
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -143,7 +226,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -152,7 +235,7 @@
"'10361698322165241623'"
]
},
"execution_count": 14,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -163,7 +246,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand All @@ -172,7 +255,7 @@
"'12050206188136005184'"
]
},
"execution_count": 15,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -183,7 +266,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 12,
"metadata": {},
"outputs": [
{
Expand All @@ -192,21 +275,14 @@
"'12062515252859527804'"
]
},
"execution_count": 16,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"distinct_keywords.hilbert_lookup_dictionary['deep_learning']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand All @@ -228,7 +304,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
"version": "3.8.10"
},
"orig_nbformat": 4
},
Expand Down
Loading