# SMLHomework/Experiments/learningmethod/experimentDT.py
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pandas
from pandas import DataFrame
import numpy
import os
workspace = "/home/toshuumilia/Workspace/SML/" # Insert the working directory here.
datasetPath = workspace + "data/sms.tsv" # Location of the SMS spam dataset (TSV format).
smsCount = 5574 # Total number of SMS messages in the dataset.
if not os.path.exists(workspace + "results/"):
    os.makedirs(workspace + "results/")
###################
# Loading dataset #
###################
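# The TSV file has two tab-separated columns and no header row: a "ham"/"spam" label
# and the raw message text. The label is mapped to 0/1 so sklearn can use it directly.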
smsDF = pandas.read_table(datasetPath, header=None, names=["label", "message"])
smsDF["label_numerical"] = smsDF.label.map({"ham": 0, "spam": 1})
smsDataset = smsDF.message
smsLabel = smsDF.label_numerical
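# Results are accumulated in long format: for every fitted model, one row per
# measure is produced (method name, measure name, value) and written to CSV.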
methodArray = []
measureArray = []
valueArray = []
availableMeasures = ["Accuracy", "F1Score"]
dataset_train, dataset_test, label_train, label_test = train_test_split(smsDataset, smsLabel, random_state=1)
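# Bag-of-words features: the vocabulary is learned on the training messages only,
# and the test messages are encoded with that same vocabulary (as sparse matrices).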
# Note: DTM = document-term matrix.
vectorizer = CountVectorizer()
trainDTM = vectorizer.fit_transform(dataset_train)
testDTM = vectorizer.transform(dataset_test)
# DEPTH EXPERIMENT
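# Sweeps max_depth over a few values (None lets the tree grow until the leaves are pure),
# repeating each setting 4 times and recording accuracy and F1 on the held-out test set.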
# availableDepths = [None, 50, 25, 10, 5, 3]
#
# print("Depth Experiment")
# for x in range(0, 4):
#     for depth in availableDepths:
#         print("Step", x, "for depth:", depth)
#         # SEE: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#         decisionTree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=depth,
#                                               min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
#                                               max_features=None, random_state=None, max_leaf_nodes=None,
#                                               min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None,
#                                               presort=False)
#         decisionTree.fit(trainDTM, label_train)
#
#         label_predicted = decisionTree.predict(testDTM)
#
#         # SEE: https://en.wikipedia.org/wiki/Precision_and_recall
#         valueArray.append(metrics.accuracy_score(label_test, label_predicted))
#         valueArray.append(metrics.f1_score(label_test, label_predicted))
#
#         for index in range(0, 2):
#             measureArray.append(availableMeasures[index])
#             methodArray.append("Depth-" + str(depth))
#
# # Save the experiments
# experimentDTDepthDF = DataFrame()
# experimentDTDepthDF["Measure"] = measureArray
# experimentDTDepthDF["Value"] = valueArray
# experimentDTDepthDF["Depth"] = methodArray
#
# experimentDTDepthDF.to_csv(workspace + "results/experimentDTDepth.csv")
# CRITERION EXPERIMENT
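# Compares the Gini impurity and entropy (information gain) split criteria,
# 4 repetitions each, with all other hyperparameters left at their defaults.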
# availableCriterion = ["gini", "entropy"]
#
# methodArray = []
# measureArray = []
# valueArray = []
#
# print("Criteron Experiment")
# for x in range(0, 4):
# for criterion in availableCriterion:
# print("Step", x, "for criterion:", criterion)
# decisionTree = DecisionTreeClassifier(criterion=criterion, splitter='best', max_depth=None,
# min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
# max_features=None, random_state=None, max_leaf_nodes=None,
# min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None,
# presort=False)
# decisionTree.fit(trainDTM, label_train)
#
# label_predicted = decisionTree.predict(testDTM)
#
# valueArray.append(metrics.accuracy_score(label_test, label_predicted))
# valueArray.append(metrics.f1_score(label_test, label_predicted))
#
# for index in range(0, 2):
# measureArray.append(availableMeasures[index])
# methodArray.append("Criterion-" + criterion)
#
# # Save the experiments
# experimentDTCriteronDF = DataFrame()
# experimentDTCriteronDF["Measure"] = measureArray
# experimentDTCriteronDF["Value"] = valueArray
# experimentDTCriteronDF["Criterion"] = methodArray
#
# experimentDTCriteronDF.to_csv(workspace + "results/experimentDTCriterion.csv")
# MIN_SAMPLES_SPLIT EXPERIMENT
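# Varies min_samples_split, the minimum number of samples a node must contain before
# it may be split; larger values prune the tree more aggressively. 20 repetitions per setting.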
# availableMinSampleSplit = [2, 10, 25, 50, 100, 250]
#
# methodArray = []
# measureArray = []
# valueArray = []
#
# print("MinSampleSplit Experiment")
# for x in range(0, 20):
#     for minSampleSplit in availableMinSampleSplit:
#         print("Step", x, "for minSampleSplit:", minSampleSplit)
#         decisionTree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None,
#                                               min_samples_split=minSampleSplit, min_samples_leaf=1,
#                                               min_weight_fraction_leaf=0.0,
#                                               max_features=None, random_state=None, max_leaf_nodes=None,
#                                               min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None,
#                                               presort=False)
#         decisionTree.fit(trainDTM, label_train)
#
#         label_predicted = decisionTree.predict(testDTM)
#
#         valueArray.append(metrics.accuracy_score(label_test, label_predicted))
#         valueArray.append(metrics.f1_score(label_test, label_predicted))
#
#         for index in range(0, 2):
#             measureArray.append(availableMeasures[index])
#             methodArray.append("MinSampleSplit-" + str(minSampleSplit))
#
# # Save the experiments
# experimentDTMinSampleSplitDF = DataFrame()
# experimentDTMinSampleSplitDF["Measure"] = measureArray
# experimentDTMinSampleSplitDF["Value"] = valueArray
# experimentDTMinSampleSplitDF["MinSampleSplit"] = methodArray
#
# experimentDTMinSampleSplitDF.to_csv(workspace + "results/experimentDTMinSampleSplit.csv")
# MAX_FEATURE EXPERIMENT
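# Varies max_features, the number of features considered when looking for the best
# split: None = all features, "sqrt"/"log2", or a fraction of the features.
# 10 repetitions per setting.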
# availableMaxFeature = [None, "sqrt", "log2", 0.25, 0.5, 0.75]
#
# methodArray = []
# measureArray = []
# valueArray = []
#
# print("MaxFeature Experiment")
# for x in range(0, 10):
#     for maxFeature in availableMaxFeature:
#         print("Step", x, "for MaxFeature:", maxFeature)
#         decisionTree = DecisionTreeClassifier(max_features=maxFeature)
#         decisionTree.fit(trainDTM, label_train)
#
#         label_predicted = decisionTree.predict(testDTM)
#
#         valueArray.append(metrics.accuracy_score(label_test, label_predicted))
#         valueArray.append(metrics.f1_score(label_test, label_predicted))
#
#         for index in range(0, 2):
#             measureArray.append(availableMeasures[index])
#             methodArray.append("MaxFeature-" + str(maxFeature))
#
# # Save the experiments
# experimentDTMaxFeatureDF = DataFrame()
# experimentDTMaxFeatureDF["Measure"] = measureArray
# experimentDTMaxFeatureDF["Value"] = valueArray
# experimentDTMaxFeatureDF["MaxFeature"] = methodArray
#
# experimentDTMaxFeatureDF.to_csv(workspace + "results/experimentDTMaxFeature.csv")
# MAX_LEAF_NODES EXPERIMENT
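# Varies max_leaf_nodes, an upper bound on the number of leaves (the tree is grown
# best-first); 5 repetitions per setting.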
# availableMaxLeafNodes = []
# for ratio in numpy.arange(1/6, 1.01, 1/6):
#     availableMaxLeafNodes.append(int(ratio * smsCount))
# # Note: the ratio-based values above are discarded; the line below overrides them.
# availableMaxLeafNodes = numpy.concatenate([[2], numpy.arange(10, 270, 10)])
# methodArray = []
# measureArray = []
# valueArray = []
#
# print("MaxLeafNodes Experiment")
# for x in range(0, 5):
#     for maxLeafNodes in availableMaxLeafNodes:
#         print("Step", x, "for MaxLeafNodes:", maxLeafNodes)
#         decisionTree = DecisionTreeClassifier(max_leaf_nodes=maxLeafNodes)
#         decisionTree.fit(trainDTM, label_train)
#
#         label_predicted = decisionTree.predict(testDTM)
#
#         valueArray.append(metrics.accuracy_score(label_test, label_predicted))
#         valueArray.append(metrics.f1_score(label_test, label_predicted))
#
#         for index in range(0, 2):
#             measureArray.append(availableMeasures[index])
#             methodArray.append(maxLeafNodes)
#
# # Save the experiments
# experimentDTMaxLeafNodesDF = DataFrame()
# experimentDTMaxLeafNodesDF["Measure"] = measureArray
# experimentDTMaxLeafNodesDF["Value"] = valueArray
# experimentDTMaxLeafNodesDF["MaxLeafNodes"] = methodArray
#
# experimentDTMaxLeafNodesDF.to_csv(workspace + "results/experimentDTMaxLeafNodes.csv")
# MIN_IMPURITY_DECREASE
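# Varies min_impurity_decrease: a node is only split if the split reduces the weighted
# impurity by at least this amount. 10 repetitions per setting.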
# availableMinImpurityDecrease = numpy.arange(0., 0.061, 0.005)
#
# methodArray = []
# measureArray = []
# valueArray = []
#
# print("MaxFeature Experiment")
# for x in range(0, 10):
# for minImpurityDecrease in availableMinImpurityDecrease:
# print("Step", x, "for MinImpurityDecrease:", minImpurityDecrease)
# decisionTree = DecisionTreeClassifier(min_impurity_decrease=minImpurityDecrease)
# decisionTree.fit(trainDTM, label_train)
#
# label_predicted = decisionTree.predict(testDTM)
#
# valueArray.append(metrics.accuracy_score(label_test, label_predicted))
# valueArray.append(metrics.f1_score(label_test, label_predicted))
#
# for index in range(0, 2):
# measureArray.append(availableMeasures[index])
# methodArray.append(str(minImpurityDecrease*100) + "%")
#
# # Save the experiments
# experimentDTMinImpurityDecreaseDF = DataFrame()
# experimentDTMinImpurityDecreaseDF["Measure"] = measureArray
# experimentDTMinImpurityDecreaseDF["Value"] = valueArray
# experimentDTMinImpurityDecreaseDF["MinImpurityDecrease"] = methodArray
#
# experimentDTMinImpurityDecreaseDF.to_csv(workspace + "results/experimentDTMinImpurityDecrease.csv")
# DEFAULT DT VS OPTIMIZED DT EXPERIMENT
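# Compares a default DecisionTreeClassifier against a custom-tuned one
# (criterion="gini", max_features=0.25), recording precision, recall, accuracy
# and F1 on the test set over 20 repetitions of each.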
availableMeasures = ["Precision", "Recall", "Accuracy", "F1Score"]
methodArray = []
measureArray = []
valueArray = []
print("MaxFeature Experiment")
for x in range(0, 20):
    print("Step", x, "for Basic Decision Tree")
    decisionTree = DecisionTreeClassifier()
    decisionTree.fit(trainDTM, label_train)
    label_predicted = decisionTree.predict(testDTM)
    valueArray.append(metrics.precision_score(label_test, label_predicted))
    valueArray.append(metrics.recall_score(label_test, label_predicted))
    valueArray.append(metrics.accuracy_score(label_test, label_predicted))
    valueArray.append(metrics.f1_score(label_test, label_predicted))
    for measure in availableMeasures:
        measureArray.append(measure)
        methodArray.append("Basic Decision Tree")

    print("Step", x, "for Custom Decision Tree")
    decisionTree = DecisionTreeClassifier(max_features=0.25, criterion="gini")
    decisionTree.fit(trainDTM, label_train)
    label_predicted = decisionTree.predict(testDTM)
    valueArray.append(metrics.precision_score(label_test, label_predicted))
    valueArray.append(metrics.recall_score(label_test, label_predicted))
    valueArray.append(metrics.accuracy_score(label_test, label_predicted))
    valueArray.append(metrics.f1_score(label_test, label_predicted))
    for measure in availableMeasures:
        measureArray.append(measure)
        methodArray.append("Custom Decision Tree")
# Save the experiments
experimentDTBasicVsOptimizedDF = DataFrame()
experimentDTBasicVsOptimizedDF["Measure"] = measureArray
experimentDTBasicVsOptimizedDF["Value"] = valueArray
experimentDTBasicVsOptimizedDF["Tuning"] = methodArray
experimentDTBasicVsOptimizedDF.to_csv(workspace + "results/experimentDTBasicVsOptimized.csv")
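# A minimal sketch (not part of the original experiment) of how the saved results
# could be summarised with pandas; kept commented out like the inactive runs above.
# resultsDF = pandas.read_csv(workspace + "results/experimentDTBasicVsOptimized.csv", index_col=0)
# print(resultsDF.groupby(["Tuning", "Measure"])["Value"].mean())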