Text options
from sklearn.feature_extraction.text import CountVectorizer
Text analysis: stop words, WordNetLemmatizer, and sklearn's CountVectorizer.
CountVectorizer learns the vocabulary (corpus of words) from a collection of
documents and produces a document-term count matrix.
# Demo: basic CountVectorizer usage.
# Fit a vocabulary over four short documents and print the learned features
# and the resulting document-term count matrix.
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
# fit_transform learns the vocabulary and returns a sparse count matrix
# of shape (n_documents, n_features).
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
Features length = 9
X array =
[[0 1 1 1 0 0 1 0 1]
[0 2 0 1 0 1 1 0 1]
[1 0 0 1 1 0 1 1 1]
[0 1 1 1 0 0 1 0 1]]
X shape = (4, 9)
# Demo: bigram features.
# ngram_range=(2, 2) makes CountVectorizer emit only word pairs
# (e.g. 'this is', 'is the') instead of single tokens.
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['and this' 'document is' 'first document' 'is the' 'is this'
'second document' 'the first' 'the second' 'the third' 'third one'
'this document' 'this is' 'this the']
Features length = 13
X array =
[[0 0 1 1 0 0 1 0 0 0 0 1 0]
[0 1 0 1 0 1 0 1 0 0 1 0 0]
[1 0 0 1 0 0 0 0 1 1 0 1 0]
[0 0 1 0 1 0 1 0 0 0 0 0 1]]
X shape = (4, 13)
# Demo: minimum document frequency.
# min_df=4 keeps only tokens that appear in at least 4 of the 4 documents,
# so the vocabulary shrinks to words common to every document.
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Minimum number of documents a token must appear in to be kept.
min_doc_freq = 4
# strip_accents='ascii' normalizes accented characters before tokenizing.
vectorizer = CountVectorizer(strip_accents='ascii', min_df=min_doc_freq)
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['is' 'the' 'this']
Features length = 3
X array =
[[1 1 1]
[1 1 1]
[1 1 1]
[1 1 1]]
X shape = (4, 3)
# Demo: custom stop words.
# Tokens listed in stop_words are excluded from the learned vocabulary
# (here 'is' is dropped, leaving 8 features instead of 9).
# NOTE: the original snippet carried `... ` REPL continuation prompts inside
# the list literal, which made it invalid Python; they are removed here.
strings = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# Words to exclude from the vocabulary.
ignore_words = ['is']
vectorizer = CountVectorizer(strip_accents='ascii', stop_words=ignore_words)
X = vectorizer.fit_transform(strings)
features = vectorizer.get_feature_names_out()
print(f"strings len = {len(strings)}")
print(f"Features = {features}")
print(f"Features length = {len(features)}")
print(f"X array = \n {X.toarray()}")
print(f"X shape = {X.shape}")
strings len = 4
Features = ['and' 'document' 'first' 'one' 'second' 'the' 'third' 'this']
Features length = 8
X array =
[[0 1 1 0 0 1 0 1]
[0 2 0 0 1 1 0 1]
[1 0 0 1 0 1 1 1]
[0 1 1 0 0 1 0 1]]
X shape = (4, 8)