41 lines
998 B
Python
41 lines
998 B
Python
from sklearn.datasets import load_iris
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
import pandas
|
|
|
|
# Example of dataset
|
|
|
|
iris = load_iris()
|
|
|
|
featureMatrix = iris.data
|
|
labelVector = iris.target
|
|
|
|
print(featureMatrix.shape)
|
|
print(labelVector.shape)
|
|
print(iris.feature_names)
|
|
print(featureMatrix[1])
|
|
|
|
# ----------------------#
|
|
# Text training example #
|
|
# ----------------------#
|
|
|
|
print("---")
|
|
|
|
dataset = ["call you tonight", "Call me a cab", "please call me... PLEASE!"]
|
|
vector = CountVectorizer()
|
|
|
|
# Learn the "vocabulary" of the training data (occurs in-place)
|
|
vector.fit(dataset)
|
|
featureNames = vector.get_feature_names()
|
|
|
|
print(featureNames)
|
|
|
|
# Transform training data into a "document-term matrix'
|
|
documentTermMatrix = vector.transform(dataset)
|
|
|
|
# convert sparse matrix to a dense matrix
|
|
documentTermMatrix.toarray()
|
|
|
|
# examine the vocabulary and document-term matrix together
|
|
df = pandas.DataFrame(documentTermMatrix.toarray(), columns=vector.get_feature_names())
|
|
print(df.head())
|