from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import CountVectorizer
import pandas

# Example dataset: the iris measurements bundled with scikit-learn
iris = load_iris()
featureMatrix = iris.data
labelVector = iris.target

print(featureMatrix.shape)
print(labelVector.shape)
print(iris.feature_names)
print(featureMatrix[1])

# ----------------------#
# Text training example #
# ----------------------#
print("---")

dataset = ["call you tonight", "Call me a cab", "please call me... PLEASE!"]
vector = CountVectorizer()

# Learn the "vocabulary" of the training data (occurs in-place)
vector.fit(dataset)
featureNames = vector.get_feature_names_out()
print(featureNames)

# Transform the training data into a "document-term matrix"
documentTermMatrix = vector.transform(dataset)

# Convert the sparse matrix to a dense matrix
print(documentTermMatrix.toarray())

# Examine the vocabulary and document-term matrix together
df = pandas.DataFrame(documentTermMatrix.toarray(),
                      columns=vector.get_feature_names_out())
print(df.head())
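
# ----------------------------------------------------------------------#
# Illustrative extra step (a sketch, not part of the original example):  #
# the fitted vectorizer can transform unseen text using the vocabulary   #
# it learned from the training data; words it never saw during fit       #
# (e.g. "taxi" below) are simply ignored. The test sentence is a         #
# hypothetical example chosen here for demonstration.                    #
# ----------------------------------------------------------------------#
testDataset = ["please call a taxi tonight"]   # hypothetical unseen text
testMatrix = vector.transform(testDataset)     # reuse the fitted vocabulary
print(pandas.DataFrame(testMatrix.toarray(),
                       columns=vector.get_feature_names_out()))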