Text options
from sklearn.feature_extraction.text import CountVectorizer
Text analysis: stop words, WordNetLemmatizer, and sklearn's CountVectorizer.
CountVectorizer learns the vocabulary (corpus of words) from a collection of
documents and produces a document-term count matrix.
# Demo: basic CountVectorizer usage.
# Fit a vocabulary over four short documents and print the learned features
# and the resulting document-term count matrix.
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary and returns a sparse count matrix
# of shape (n_documents, n_features).
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Features length = 9
X array =
[[0 1 1 1 0 0 1 0 1]
[0 2 0 1 0 1 1 0 1]
[1 0 0 1 1 0 1 1 1]
[0 1 1 1 0 0 1 0 1]]
X shape = (4, 9)
# Demo: bigram features.
# ngram_range=(2, 2) makes CountVectorizer emit only word pairs
# (e.g. 'this is', 'is the') instead of single tokens.
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['and this' 'document is' 'first document' 'is the' 'is this'
'second document' 'the first' 'the second' 'the third' 'third one'
'this document' 'this is' 'this the']
Features length = 13
X array =
[[0 0 1 1 0 0 1 0 0 0 0 1 0]
[0 1 0 1 0 1 0 1 0 0 1 0 0]
[1 0 0 1 0 0 0 0 1 1 0 1 0]
[0 0 1 0 1 0 1 0 0 0 0 0 1]]
X shape = (4, 13)
# Demo: minimum document frequency.
# min_df=4 keeps only tokens that appear in at least 4 of the 4 documents,
# so the vocabulary shrinks to words common to every document.
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Minimum number of documents a token must appear in to be kept.
min_doc_freq = 4
# strip_accents='ascii' normalizes accented characters before tokenizing.
vectorizer = CountVectorizer(strip_accents='ascii', min_df=min_doc_freq)
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['is' 'the' 'this']
Features length = 3
X array =
[[1 1 1]
[1 1 1]
[1 1 1]
[1 1 1]]
X shape = (4, 3)
# Demo: custom stop words.
# Tokens listed in stop_words are excluded from the learned vocabulary
# (here 'is' is dropped, leaving 8 features instead of 9).
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Words to exclude from the vocabulary.
ignore_words = ['is']
vectorizer = CountVectorizer(strip_accents='ascii', stop_words=ignore_words)
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['and' 'document' 'first' 'one' 'second' 'the' 'third' 'this']
Features length = 8
X array =
[[0 1 1 0 0 1 0 1]
[0 2 0 0 1 1 0 1]
[1 0 0 1 0 1 1 1]
[0 1 1 0 0 1 0 1]]
X shape = (4, 8)