Initial commit

master
Gregory Martin 2017-11-06 11:39:57 +01:00
commit 910b138668
11 changed files with 1194 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
Subject/
target/
.idea/

25
DMV.iml Normal file
View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_8">
<output url="file://$MODULE_DIR$/target/classes" />
<output-test url="file://$MODULE_DIR$/target/test-classes" />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/Subject" />
<excludeFolder url="file://$MODULE_DIR$/lib" />
<excludeFolder url="file://$MODULE_DIR$/target" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="SPMF" level="project" />
<orderEntry type="library" name="Maven: org.jetbrains:annotations:15.0" level="project" />
<orderEntry type="library" name="Maven: com.opencsv:opencsv:4.0" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-lang3:3.6" level="project" />
<orderEntry type="library" name="Maven: org.apache.commons:commons-text:1.1" level="project" />
<orderEntry type="library" name="Maven: commons-beanutils:commons-beanutils:1.9.3" level="project" />
<orderEntry type="library" name="Maven: commons-logging:commons-logging:1.2" level="project" />
<orderEntry type="library" name="Maven: commons-collections:commons-collections:3.2.2" level="project" />
</component>
</module>

10
README.md Normal file
View File

@ -0,0 +1,10 @@
# DMVHomework - Note to the teacher
Three data mining algorithms have been used for this homework: Apriori, LCM and BIDE+.
Each of these three algorithms has its own class, named after the algorithm and located in the package *algorithm*.
Each class has a main method which is able to launch either one instance of the selected algorithm or an experiment on this algorithm.
To provide inputs to those algorithms, the class *main.DatasetConverter* takes the raw dataset and converts it into the format expected by Apriori and LCM (*.transaction*) and by BIDE+ (*.sequence*).
For BIDE+, since the raw dataset contains named items, the formatted *.sequence* file contains only numeric IDs. A file with the same name and the extension *.seqinfo* gives the association between the name of each item and its ID.
To explore the patterns returned by those algorithms, several methods in the class *main.DataExplorer* help to choose meaningful patterns.

BIN
data.7z Normal file

Binary file not shown.

BIN
lib/SPMF.jar Normal file

Binary file not shown.

35
pom.xml Normal file
View File

@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>fr.urao.dmv</groupId>
<artifactId>DMV Homework</artifactId>
<version>1</version>
<dependencies>
<dependency>
<groupId>org.jetbrains</groupId>
<artifactId>annotations</artifactId>
<version>RELEASE</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>4.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,92 @@
package algorithm;
import ca.pfv.spmf.algorithms.frequentpatterns.apriori.AlgoApriori;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import main.DataExplorer;
import java.io.*;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Command-line driver for the SPMF Apriori implementation.
 *
 * Runs one Apriori mining pass that writes its patterns to disk, then two
 * timing/pattern-count experiments (product ids and aisle ids).
 */
public class Apriori {
    // Divisor turning an absolute minimum support into the relative value handed to SPMF.
    // Presumably the number of transactions in the training set - TODO confirm.
    private static final double TRANSACTION_COUNT = 131209d;

    public static void main(String[] args) throws InterruptedException {
        System.out.println("Start Time: "+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME));

        String experimentProduct = "Apriori-ProductID";
        String experimentAisle = "Apriori-AisleID";

        runApriori(DataExplorer.transactionsetAisle, DataExplorer.aprioriPatterns, 300d / TRANSACTION_COUNT);
        runExperimentApriori(DataExplorer.transactionsetProduct, experimentProduct);
        runExperimentApriori(DataExplorer.transactionsetAisle, experimentAisle);

        System.out.println("Apriori Ended.");
    }

    /**
     * Runs Apriori once and prints the statistics reported by SPMF.
     *
     * @param transactionPath path of the transaction file to mine
     * @param patternPath     path of the file receiving the patterns
     * @param minsup          relative minimum support
     */
    private static void runApriori(String transactionPath, String patternPath, double minsup){
        AlgoApriori apriori = new AlgoApriori();
        try {
            apriori.runAlgorithm(minsup, transactionPath, patternPath);
            apriori.printStats();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs Apriori for absolute minimum supports 300, 350, ..., 950 (in random order)
     * and records, for each run, the elapsed time in whole seconds and the number of
     * patterns found.
     *
     * Results go to {@code <experimentName>Time.perf} and {@code <experimentName>Count.perf}
     * in the working directory.
     *
     * @param transactionPath path of the transaction file to mine
     * @param experimentName  label written in the "algorithm" column and used as file name prefix
     */
    private static void runExperimentApriori(String transactionPath, String experimentName){
        List<Integer> listMinsup = new ArrayList<>();
        for(int minsup = 300; minsup < 1000; minsup += 50){
            listMinsup.add(minsup);
        }
        // Shuffled so that the measured times do not depend on the order of the runs.
        Collections.shuffle(listMinsup);

        // try-with-resources: both files are closed even if a mining run throws.
        try (BufferedWriter writerTime = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + experimentName +"Time.perf"));
             BufferedWriter writerCount = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + experimentName +"Count.perf"))) {
            writerTime.write("minsup,time,algorithm");
            writerTime.newLine();
            writerCount.write("minsup,pattern_count,algorithm");
            writerCount.newLine();

            int step = 1;
            for(Integer minsup : listMinsup) {
                AlgoApriori apriori = new AlgoApriori();
                System.out.println("["+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME) +"]Step "+ step +"/"+ listMinsup.size() +" - Starting with minsup: "+ minsup);

                long start = System.currentTimeMillis();
                // A null output path makes SPMF return the patterns instead of writing them.
                Itemsets result = apriori.runAlgorithm(minsup / TRANSACTION_COUNT, transactionPath, null);
                long end = System.currentTimeMillis();

                int patternCount = 0;
                for(List<Itemset> level : result.getLevels()){
                    patternCount += level.size();
                }

                writerCount.write(minsup +","+ patternCount +","+ experimentName);
                writerCount.newLine();
                // Integer division: the time is recorded in whole seconds.
                writerTime.write(minsup +","+ (end - start)/1000 +","+ experimentName);
                writerTime.newLine();
                step++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,32 @@
package algorithm;
import ca.pfv.spmf.algorithms.sequentialpatterns.prefixspan.AlgoBIDEPlus;
import main.DataExplorer;
import java.io.IOException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
/**
 * Launcher for the SPMF BIDE+ sequential pattern algorithm.
 */
public class BIDEPlus {
    public static void main(String[] args) {
        String startTime = LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME);
        System.out.println("Start Time: "+ startTime);

        runBIDEPlus(DataExplorer.transactionsetSequence, DataExplorer.bideplusPatterns);
    }

    /**
     * Mines the given sequence file with BIDE+ and prints the statistics reported by SPMF.
     *
     * @param sequenceDataset path of the sequence file to mine
     * @param patternOutput   path of the file receiving the patterns
     */
    private static void runBIDEPlus(String sequenceDataset, String patternOutput){
        // Relative minimum support: 400 over 19999
        // (presumably the number of sequences in the dataset - TODO confirm).
        final double relativeMinsup = 400d/19999;

        try {
            AlgoBIDEPlus miner = new AlgoBIDEPlus();
            miner.setShowSequenceIdentifiers(false);
            miner.runAlgorithm(sequenceDataset, relativeMinsup, patternOutput);
            miner.printStatistics();
        } catch (IOException ioFailure) {
            ioFailure.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,97 @@
package algorithm;
import ca.pfv.spmf.algorithms.frequentpatterns.lcm.AlgoLCM;
import ca.pfv.spmf.algorithms.frequentpatterns.lcm.Dataset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset;
import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets;
import main.DataExplorer;
import org.jetbrains.annotations.Nullable;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Command-line driver for the SPMF LCM implementation.
 *
 * Runs one LCM mining pass that writes its patterns to disk, then a
 * timing/pattern-count experiment on the product transaction set.
 */
public class LCM {
    // Divisor turning an absolute minimum support into the relative value handed to SPMF.
    // Presumably the number of transactions in the training set - TODO confirm.
    private static final double TRANSACTION_COUNT = 131209d;

    public static void main(String[] args) {
        System.out.println("Start Time: "+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME));

        String exprimentProduct = "LCM-ProductID";

        runLCM(DataExplorer.transactionsetProduct, DataExplorer.lcmPatterns, 300d / TRANSACTION_COUNT);
        runExperimentLCM(DataExplorer.transactionsetProduct, exprimentProduct);

        System.out.println("LCM Ended.");
    }

    /**
     * Runs LCM once and prints the statistics reported by SPMF.
     *
     * @param datasetPath path of the transaction file to mine
     * @param output      path of the file receiving the patterns, may be null
     * @param minsup      relative minimum support
     */
    private static void runLCM(String datasetPath, @Nullable String output, double minsup){
        AlgoLCM lcm = new AlgoLCM();
        try {
            Dataset dataset = new Dataset(datasetPath);
            lcm.runAlgorithm(minsup, dataset, output);
            lcm.printStats();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs LCM for absolute minimum supports 50, 100, ..., 950 (in random order) and
     * records, for each run, the elapsed time in whole seconds (dataset loading included)
     * and the number of patterns found.
     *
     * Results go to {@code performanceLCMTime.csv} and {@code performanceLCMCount.csv}
     * in the working directory.
     *
     * @param transactionPath path of the transaction file to mine
     * @param experimentName  label written in the "algorithm" column of both files
     */
    private static void runExperimentLCM(String transactionPath, String experimentName){
        List<Integer> listMinsup = new ArrayList<>();
        for(int minsup = 50; minsup < 1000; minsup += 50){
            listMinsup.add(minsup);
        }
        // Shuffled so that the measured times do not depend on the order of the runs.
        Collections.shuffle(listMinsup);

        // try-with-resources: both files are closed even if a mining run throws.
        try (BufferedWriter writerTime = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + "performanceLCMTime.csv"));
             BufferedWriter writerCount = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + "performanceLCMCount.csv"))) {
            writerTime.write("minsup,time,algorithm");
            writerTime.newLine();
            writerCount.write("minsup,pattern_count,algorithm");
            writerCount.newLine();

            int step = 1;
            for(Integer minsup : listMinsup) {
                AlgoLCM lcm = new AlgoLCM();
                System.out.println("["+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME) +"]Step "+ step +"/"+ listMinsup.size() +" - Starting with minsup: "+ minsup);

                long start = System.currentTimeMillis();
                Dataset dataset = new Dataset(transactionPath);
                // A null output path makes SPMF return the patterns instead of writing them.
                Itemsets itemsets = lcm.runAlgorithm(minsup / TRANSACTION_COUNT, dataset, null);
                long end = System.currentTimeMillis();

                int patternCount = 0;
                for(List<Itemset> level : itemsets.getLevels()){
                    patternCount += level.size();
                }

                // Both files carry the same experiment label so their rows can be joined.
                writerCount.write(minsup +","+ patternCount +","+ experimentName);
                writerCount.newLine();
                // Integer division: the time is recorded in whole seconds.
                writerTime.write(minsup +","+ (end - start)/1000 +","+ experimentName);
                writerTime.newLine();
                step++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -0,0 +1,616 @@
package main;
import com.opencsv.CSVReader;
import java.io.*;
import java.util.*;
public class DataExplorer {
// Directory that every other path below is built from (absolute, machine-specific path).
public static String workingDirectory = "/home/toshuumilia/tmp/testDMV/";
// Raw CSV that main converts into the product and aisle transaction sets.
public static String rawdatasetProduct = workingDirectory + "order_products__train.csv";
// Raw input that main converts into the sequence dataset.
public static String rawSequenceDataset = workingDirectory + "transactions_seq.txt";
// Generated transaction files (one per item granularity) and generated sequence file.
public static String transactionsetProduct = workingDirectory + "trainProduct.transaction";
public static String transactionsetAisle = workingDirectory + "trainAisle.transaction";
public static String transactionsetSequence = workingDirectory + "customer.sequence";
// Lookup CSVs: product information (used to map products to aisles) and aisle names.
public static String productInformation = workingDirectory + "products.csv";
public static String aisleInformation = workingDirectory + "aisles.csv";
// Item name/ID association written alongside the sequence dataset.
public static String sequenceInformation = workingDirectory + "customer.seqinfo";
// Pattern files handed to the three algorithm launchers as their output paths.
public static String lcmPatterns = workingDirectory + "pattern.lcm";
public static String aprioriPatterns = workingDirectory + "pattern.apriori";
public static String bideplusPatterns = workingDirectory + "pattern.bideplus";
/**
 * Runs every conversion and exploration step in sequence: builds the product and aisle
 * transaction sets, reports item statistics, inspects the mined itemsets, then builds
 * the sequence dataset and inspects the mined sequences.
 */
public static void main(String[] args) {
    // Create the dataset in the transaction format.
    DatasetConverter.convertCSVIntoTransaction(rawdatasetProduct, transactionsetProduct);
    DatasetConverter.sortTransaction(transactionsetProduct);

    // Find the maximum support in the product transaction set.
    checkMaxSupport(transactionsetProduct);

    // Replace the product id with its aisle id.
    DatasetConverter.replaceIdByAisle(productInformation, rawdatasetProduct, transactionsetAisle);
    DatasetConverter.sortTransaction(transactionsetAisle);

    // Find the maximum support in the aisle transaction set.
    checkMaxSupport(transactionsetAisle);

    // Count how many items of the *product* and then of the *aisle* transaction set have
    // a support lower than 1% of the highest one.
    // NOTE: both calls write to the same file, so the aisle result replaces the product one.
    String occurrenceReport = workingDirectory +"itemOccurrence.csv";
    separateItemOccurrence(occurrenceReport, getItemOccurrence(transactionsetProduct), 187.27);
    separateItemOccurrence(occurrenceReport, getItemOccurrence(transactionsetAisle), 721.28);

    // Find the 100 itemsets with the highest support that contain at least 2 items.
    findMaxSupportItemsets(lcmPatterns, 100, 1);

    // Find the itemsets containing the wanted items.
    Set<Integer> wantedItems = new HashSet<>(Arrays.asList(24,83,120,123));
    chooseItemset(lcmPatterns, wantedItems);

    // Compute the support of an itemset.
    Set<Integer> supportItems = new HashSet<>(Arrays.asList(24, 83));
    computeSupport(supportItems, transactionsetAisle);

    // Compute the confidence of an association rule.
    Set<Integer> ruleAntecedents = new HashSet<>(Arrays.asList(24, 83, 120));
    Set<Integer> ruleConsequents = new HashSet<>(Arrays.asList(123));
    computeConfidence(ruleAntecedents, ruleConsequents, transactionsetAisle);

    // Get the name of some aisles.
    Set<Integer> aisleIds = new HashSet<>(Arrays.asList(24, 83));
    findNameAisle(aisleIds, aisleInformation);

    // Create the sequence dataset.
    DatasetConverter.convertCSVIntoSequences(rawSequenceDataset, transactionsetSequence, sequenceInformation);
    DatasetConverter.sortSequences(transactionsetSequence);

    // Find the 10 most supported sequences containing at least 3 itemsets, at least one of
    // them having 2 items, and exclude every sequence containing one of the listed items.
    Set<Integer> bannedItems = new HashSet<>(Arrays.asList(93, 474, 6, 66));
    findMaxSupportSequence(bideplusPatterns, 10, 2, 2, bannedItems);

    // Get the name of some sequence items.
    Set<Integer> sequenceItems = new HashSet<>(Arrays.asList(75, 251));
    findNameProductSeq(sequenceItems, sequenceInformation);
}
/**
 * Counts how many times each item id appears in a transaction file.
 *
 * Each line is split on single spaces and every token is parsed as an integer id.
 * Tokens that are not integers are reported on stderr and skipped.
 *
 * @param transactionPath path of the file to read; it must contain ".transaction",
 *                        otherwise nothing is read
 * @return a map from item id to its number of occurrences; empty when the path is not
 *         a ".transaction" path, possibly partial when reading fails
 */
public static Map<Integer, Integer> getItemOccurrence(String transactionPath){
    Map<Integer, Integer> mapItemOccurrence = new HashMap<>();
    if(!transactionPath.contains(".transaction")){
        return mapItemOccurrence;
    }

    // try-with-resources: the reader was previously never closed.
    try (BufferedReader reader = new BufferedReader(new FileReader(transactionPath))) {
        String line;
        while ((line = reader.readLine()) != null){
            for(String token : line.split(" ")){
                try {
                    mapItemOccurrence.merge(Integer.valueOf(token), 1, Integer::sum);
                } catch (NumberFormatException e){
                    System.err.println("NumberFormatException");
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return mapItemOccurrence;
}
/**
 * Prints the number of transactions of a transaction file, the item that occurs most
 * often (with its occurrence count) and that count divided by the number of transactions.
 *
 * Tokens that are not integers are reported on stderr and skipped.
 *
 * @param transactionPath path of the file to read; it must contain ".transaction"
 * @throws IllegalArgumentException if the path does not contain ".transaction"
 */
public static void checkMaxSupport(String transactionPath){
    if(!transactionPath.contains(".transaction")){
        throw new IllegalArgumentException("Not a .transaction file: "+ transactionPath);
    }

    // try-with-resources: the reader is closed even if reading fails half-way.
    try (BufferedReader reader = new BufferedReader(new FileReader(transactionPath))) {
        Map<Integer, Integer> mapItemOccurrence = new HashMap<>();
        String line;
        int numberTransaction = 0;
        int maxSupport = 0;
        int idMaxSupport = -1;

        while ((line = reader.readLine()) != null){
            for(String token : line.split(" ")){
                try {
                    Integer item = Integer.valueOf(token);
                    int numberOccurrence = mapItemOccurrence.merge(item, 1, Integer::sum);
                    // Strict comparison: on a tie the item that reached the count first is kept.
                    if (maxSupport < numberOccurrence) {
                        maxSupport = numberOccurrence;
                        idMaxSupport = item;
                    }
                } catch (NumberFormatException e){
                    System.err.println("NumberFormatException");
                }
            }
            numberTransaction++;
        }

        System.out.println("Number of Transactions: "+ numberTransaction);
        System.out.println("Most present item: "+ idMaxSupport + " ("+ maxSupport +" items)");
        System.out.println("Max relative support for Apriori: "+ ((double) maxSupport)/numberTransaction);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Splits the items in two groups, occurrence count below the threshold and occurrence
 * count at or above it, and writes the size of each group as a two-row CSV.
 *
 * The second interval is labelled with {@code threshold * 100} truncated to an int as
 * its upper bound.
 *
 * @param output            path of the CSV file to (over)write
 * @param mapItemOccurrence occurrence count per item id
 * @param threshold         boundary between the two groups
 */
public static void separateItemOccurrence(String output, Map<Integer, Integer> mapItemOccurrence, double threshold){
    int belowThreshold = 0;
    int atOrAboveThreshold = 0;
    for(Integer occurrence : mapItemOccurrence.values()){
        if(occurrence < threshold){
            belowThreshold++;
        } else {
            atOrAboveThreshold++;
        }
    }

    // try-with-resources: the file is closed even if a write fails.
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
        writer.write("interval,occurrence");
        writer.newLine();
        writer.write("[0.00;"+ threshold +"[,"+ belowThreshold);
        writer.newLine();
        writer.write("["+ threshold +";"+ (int) (threshold*100) +"],"+ atOrAboveThreshold);
        writer.newLine();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Prints every pattern line whose itemset contains all the requested items.
 *
 * Each line is expected to look like {@code "<item> <item> ... #SUP: <support>"};
 * only the part before {@code " #SUP: "} is inspected.
 *
 * @param patternPath path of the pattern file to read
 * @param itemsNeeded items that must all be present in a pattern for it to be printed
 */
public static void chooseItemset(String patternPath, Set<Integer> itemsNeeded){
    // try-with-resources: the reader is closed even if an item fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(patternPath))) {
        StringBuilder matchingLines = new StringBuilder();
        String line;

        while((line = reader.readLine()) != null){
            String itemPart = line.split(" #SUP: ")[0];
            Set<Integer> pattern = new HashSet<>();
            for(String item : itemPart.split(" ")){
                pattern.add(Integer.valueOf(item));
            }

            if(pattern.containsAll(itemsNeeded)){
                matchingLines.append(line);
                matchingLines.append('\n');
            }
        }

        System.out.println(matchingLines.toString());
    } catch (IOException e){
        e.printStackTrace();
    }
}
/**
 * Counts the transactions of a file that contain every item of the given itemset and
 * prints the result as {@code "{a,b}: Supp=<count>"}.
 *
 * @param itemset items that must all be present in a transaction for it to be counted
 * @param dataset path of the transaction file (space-separated integer ids, one
 *                transaction per line)
 */
public static void computeSupport(Set<Integer> itemset, String dataset){
    // try-with-resources: the reader is closed even if an item fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(dataset))) {
        String line;
        int support = 0;

        while ((line = reader.readLine()) != null){
            Set<Integer> transaction = new HashSet<>();
            for(String item : line.split(" ")){
                transaction.add(Integer.valueOf(item));
            }
            if(transaction.containsAll(itemset)){
                support++;
            }
        }

        System.out.println(prettyprintItemset(itemset) + ": Supp="+ support);
    } catch (IOException e){
        e.printStackTrace();
    }
}
/**
 * Computes and prints the confidence of the association rule
 * {@code antecedents -> consequents} over a transaction file, i.e. the number of
 * transactions containing both sets divided by the number containing the antecedents.
 * The two supports are printed as well.
 *
 * @param antecedents left-hand side of the rule
 * @param consequents right-hand side of the rule
 * @param dataset     path of the transaction file (space-separated integer ids, one
 *                    transaction per line)
 */
public static void computeConfidence(Set<Integer> antecedents, Set<Integer> consequents, String dataset){
    Set<Integer> antUcon = new HashSet<>(antecedents);
    antUcon.addAll(consequents);

    // Kept as doubles so that the ratio below is a floating-point division.
    double antUconSupport = 0;
    double antSupport = 0;

    // try-with-resources: the reader is closed even if an item fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(dataset))) {
        String line;

        while((line = reader.readLine()) != null){
            Set<Integer> transaction = new HashSet<>();
            for(String item : line.split(" ")){
                transaction.add(Integer.valueOf(item));
            }

            if(transaction.containsAll(antecedents)){
                antSupport++;
            }
            if(transaction.containsAll(antUcon)){
                antUconSupport++;
            }
        }

        // StringJoiner copes with an empty set, where trimming a trailing comma would fail.
        StringJoiner antecedentText = new StringJoiner(",");
        for(Integer item_id : antecedents){
            antecedentText.add(item_id.toString());
        }
        StringJoiner consequentText = new StringJoiner(",");
        for(Integer item_id : consequents){
            consequentText.add(item_id.toString());
        }

        System.out.println("{"+ antecedentText +"} -> {"+ consequentText +"}: Conf="+ antUconSupport / antSupport);
        System.out.println("Antecedents support: "+ antSupport);
        System.out.println("Antecedents U Consequents support: "+ antUconSupport);
    } catch (IOException e){
        e.printStackTrace();
    }
}
/**
 * Prints the number of patterns that are large enough, then the {@code n} of them with
 * the highest support.
 *
 * Each line is expected to look like {@code "<item> <item> ... #SUP: <support>"}.
 *
 * @param patternPath                path of the pattern file to read
 * @param n                          maximum number of patterns to print
 * @param excludePatternSizeLessThan a pattern is kept only when it has strictly more
 *                                   items than this value
 */
public static void findMaxSupportItemsets(String patternPath, int n, int excludePatternSizeLessThan){
    Map<String, Integer> mapISSupport = new HashMap<>();

    // try-with-resources: the reader is closed even if a line fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(patternPath))) {
        String line;

        while((line = reader.readLine()) != null){
            String[] lineParts = line.split(" #SUP: ");
            String[] items = lineParts[0].split(" ");

            if(items.length > excludePatternSizeLessThan) {
                Integer support = Integer.valueOf(lineParts[1]);
                // Sorted so that the printed itemset lists its items in ascending order.
                SortedSet<Integer> itemset = new TreeSet<>();
                for (String item_id : items) {
                    itemset.add(Integer.valueOf(item_id));
                }
                mapISSupport.put(prettyprintItemset(itemset), support);
            }
        }
    } catch (IOException e){
        e.printStackTrace();
        return;
    }

    System.out.println(mapISSupport.keySet().size() +" interesting patterns found.");

    // The map is local and not used afterwards, so it can be handed over directly.
    for(String itemsetStr : findMaxSupportMap(mapISSupport, n)) {
        System.out.println(itemsetStr);
    }
}
/**
 * Prints the number of sequential patterns that pass the filters, then the {@code n} of
 * them with the highest support.
 *
 * Each line is expected to look like {@code "<items> -1 <items> -1 #SUP: <support>"},
 * where {@code -1} ends an itemset. A pattern is kept when it has strictly more itemsets
 * than {@code excludeSequenceSizeLessThan}, contains none of the excluded items and has
 * at least one itemset with {@code sizeOneItemsetAtLeast} items or more.
 *
 * @param patternPath                 path of the pattern file to read
 * @param n                           maximum number of patterns to print
 * @param excludeSequenceSizeLessThan a pattern needs strictly more itemsets than this value
 * @param sizeOneItemsetAtLeast       minimum size that at least one itemset must reach
 * @param excludeItems                items that disqualify any pattern containing them
 */
public static void findMaxSupportSequence(String patternPath, int n, int excludeSequenceSizeLessThan, int sizeOneItemsetAtLeast, Set<Integer> excludeItems){
    Map<String, Integer> mapSSupport = new HashMap<>();

    // try-with-resources: the reader is closed even if a line fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(patternPath))) {
        String line;

        while((line = reader.readLine()) != null){
            String[] lineParts = line.split(" #SUP: ");
            String[] itemsets = lineParts[0].split("-1");

            if(itemsets.length <= excludeSequenceSizeLessThan) {
                continue;
            }

            Integer support = Integer.valueOf(lineParts[1]);
            boolean itemExcluded = false;
            boolean hasLargeItemset = false;

            for(String itemset : itemsets){
                int nbItems = 0;
                for(String item : itemset.split(" ")){
                    if(item.equals("")){
                        continue;
                    }
                    if(excludeItems.contains(Integer.valueOf(item))){
                        itemExcluded = true;
                    }
                    nbItems++;
                }
                if(nbItems >= sizeOneItemsetAtLeast){
                    hasLargeItemset = true;
                }
            }

            if(!itemExcluded && hasLargeItemset) {
                mapSSupport.put(prettyprintSequence(lineParts[0]), support);
            }
        }
    } catch (IOException e){
        e.printStackTrace();
        return;
    }

    System.out.println(mapSSupport.keySet().size() +" interesting patterns found.");

    // The map is local and not used afterwards, so it can be handed over directly.
    for(String itemsetStr : findMaxSupportMap(mapSSupport, n)) {
        System.out.println(itemsetStr);
    }
}
/**
 * Returns the {@code n} entries with the highest support, best first, each formatted as
 * {@code "<key>-Supp=<support>"}.
 *
 * The map is no longer modified, and it is sorted once instead of being scanned once
 * per requested result. Entries with equal support keep the iteration order of the map.
 * Entries with a negative support are never returned.
 *
 * @param mapSupport support per pattern; left untouched
 * @param n          maximum number of entries to return
 * @return at most {@code n} formatted entries, by decreasing support
 */
public static List<String> findMaxSupportMap(Map<String, Integer> mapSupport, int n){
    List<String> nMaxItemset = new ArrayList<>();
    if(n <= 0 || mapSupport.isEmpty()){
        return nMaxItemset;
    }

    List<Map.Entry<String, Integer>> entries = new ArrayList<>(mapSupport.entrySet());
    // List.sort is stable, so ties stay in the iteration order of the map.
    entries.sort((left, right) -> Integer.compare(right.getValue(), left.getValue()));

    for(Map.Entry<String, Integer> entry : entries){
        if(nMaxItemset.size() == n || entry.getValue() < 0){
            break;
        }
        nMaxItemset.add(entry.getKey() + "-Supp=" + entry.getValue());
    }

    return nMaxItemset;
}
/**
 * Prints the names of the requested aisles as {@code "{*name, name*}"}.
 *
 * The CSV is read with its first line skipped; column 0 is parsed as the aisle id and
 * column 1 is printed as its name. When no aisle matches, {@code "{**}"} is printed.
 *
 * @param aislesNeeded      ids of the aisles to look up
 * @param aisleInformations path of the aisle CSV file
 */
public static void findNameAisle(Set<Integer> aislesNeeded, String aisleInformations){
    // try-with-resources: the CSV reader is closed even if a row fails to parse.
    try (CSVReader csvReader = new CSVReader(new FileReader(aisleInformations))) {
        // StringJoiner copes with zero matches, where trimming a trailing ", " would fail.
        StringJoiner names = new StringJoiner(", ", "{*", "*}");
        String[] lineParts;

        csvReader.readNext(); // Trash attributes name line

        while((lineParts = csvReader.readNext()) != null){
            if(aislesNeeded.contains(Integer.valueOf(lineParts[0]))){
                names.add(lineParts[1]);
            }
        }

        System.out.println(names);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Prints {@code "<id>=><name>"} for every requested item of a sequence information file.
 *
 * Each line is split on commas; the first field is printed as the name and the second
 * one is parsed as the item id.
 *
 * @param productsNeeded          ids of the items to look up
 * @param sequenceInformationPath path of the sequence information file
 */
public static void findNameProductSeq(Set<Integer> productsNeeded, String sequenceInformationPath){
    // try-with-resources: the reader is closed even if an id fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(sequenceInformationPath))) {
        String line;

        while((line = reader.readLine()) != null){
            String[] lineParts = line.split(",");
            Integer itemID = Integer.valueOf(lineParts[1]);

            if(productsNeeded.contains(itemID)){
                System.out.println(lineParts[1] +"=>"+ lineParts[0]);
            }
        }
    } catch (IOException e){
        e.printStackTrace();
    }
}
/**
 * Formats an itemset as {@code "{a,b,c}"}, following the iteration order of the set.
 * An empty set gives {@code "{}"}.
 *
 * @param itemset items to format
 * @return the formatted itemset
 */
private static String prettyprintItemset(Set<Integer> itemset){
    // StringJoiner copes with an empty set, where trimming a trailing comma would fail.
    StringJoiner joiner = new StringJoiner(",", "{", "}");
    for(Integer item_id : itemset){
        joiner.add(String.valueOf(item_id));
    }
    return joiner.toString();
}
/**
 * Parses a raw sequence, in which {@code -1} separates the itemsets and spaces separate
 * the items, and formats it with {@link #prettyprintSequence(List)}.
 *
 * @param rawSequence sequence text, without its support part
 * @return the formatted sequence
 */
private static String prettyprintSequence(String rawSequence){
    List<Set<Integer>> parsedSequence = new ArrayList<>();

    for(String chunk : rawSequence.split("-1")){
        Set<Integer> parsedItems = new HashSet<>();
        for(String token : chunk.split(" ")){
            // Splitting on single spaces leaves empty tokens around the separators.
            if(token.equals("")){
                continue;
            }
            parsedItems.add(Integer.valueOf(token));
        }
        parsedSequence.add(parsedItems);
    }

    return prettyprintSequence(parsedSequence);
}
/**
 * Formats a sequence as {@code "[{a,b} {c}]"}: its itemsets, formatted by
 * {@code prettyprintItemset}, separated by single spaces. An empty sequence gives
 * {@code "[]"}.
 *
 * @param sequence itemsets of the sequence, in order
 * @return the formatted sequence
 */
private static String prettyprintSequence(List<Set<Integer>> sequence){
    // StringJoiner copes with an empty list, where trimming a trailing space would fail.
    StringJoiner joiner = new StringJoiner(" ", "[", "]");
    for(Set<Integer> itemset : sequence){
        joiner.add(prettyprintItemset(itemset));
    }
    return joiner.toString();
}
// public static void countNumberTransactionWithMoreThanNItem(String transactionPath, int n){
// try{
// BufferedReader reader = new BufferedReader(new FileReader(transactionPath));
// String line;
// int sum = 0;
//
// while((line = reader.readLine()) != null){
// sum += line.split(" ").length >= n ? 1 : 0;
// }
//
// System.out.println("Number of transactions with at least "+ n +" items: "+ sum);
//
// reader.close();
// } catch (IOException e){
// e.printStackTrace();
// }
// }
// public static void countNumberItemInTransaction(File input, File output){
// if(input.getAbsolutePath().contains(".transaction")) {
// try {
// BufferedReader reader = new BufferedReader(new FileReader(input));
// BufferedWriter writer = new BufferedWriter(new FileWriter(output));
// String line;
//
// writer.write("length");
// writer.newLine();
//
// while ((line = reader.readLine()) != null) {
// String[] lineSplit = line.split(" ");
// writer.write(lineSplit.length +"");
// writer.newLine();
// }
//
// writer.close();
// reader.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
// } else {
// throw new RuntimeException("");
// }
// }
// public static void groupItemOccurrence(String output, Map<Integer, Integer> mapItemOccurrence){
// Map<String, Integer> mapIntervalCount = new HashMap<>();
// List<String> listInterval = new ArrayList<>();
//
// DecimalFormat nFormat = new DecimalFormat("#.##");
// DecimalFormatSymbols dfs = new DecimalFormatSymbols();
// dfs.setDecimalSeparator('.');
// nFormat.setDecimalFormatSymbols(dfs);
//
//
// int nbInterval = 100;
// double max = 72178d +1;
//
// for(int i = 0; i < nbInterval; i++){
// String interval = "["+ nFormat.format((max/nbInterval)*i) +";"+ nFormat.format((max/nbInterval)*(i+1)) +"[";
// listInterval.add(interval);
// mapIntervalCount.put(interval, 0);
// }
//
// for(Integer itemId : mapItemOccurrence.keySet()){
// String interval = listInterval.get((int) (mapItemOccurrence.get(itemId)/(max/nbInterval)));
//
// mapIntervalCount.put(interval, mapIntervalCount.getOrDefault(interval, 0)+1);
// }
//
// try{
// BufferedWriter writer = new BufferedWriter(new FileWriter(output));
//
// writer.write("interval,occurrence");
// writer.newLine();
//
// for(String key : listInterval){
// if(!mapIntervalCount.get(key).equals(0)) {
// writer.write(key + "," + mapIntervalCount.get(key));
// writer.newLine();
// }
// }
//
// writer.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
// }
}

View File

@ -0,0 +1,284 @@
package main;
import com.opencsv.CSVReader;
import java.io.*;
import java.util.*;
public class DatasetConverter {
/**
 * Converts an order CSV into a transaction file: one line per order, holding the product
 * ids of that order, each followed by a space.
 *
 * The first line of the input is skipped. Column 0 is read as the order id and column 1
 * as the product id. Rows of one order are expected to be contiguous, since a new
 * output line is started each time the order id changes.
 *
 * @param input  path of the CSV file to read
 * @param output path of the transaction file to (over)write
 */
public static void convertCSVIntoTransaction(String input, String output){
    // try-with-resources: both files are closed even if a row fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(input));
         BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
        String line;
        int lastIdOrder = -1;
        boolean firstTransaction = true;

        // Delete first line with the header.
        reader.readLine();

        while ((line = reader.readLine()) != null){
            String[] fields = line.split(",");
            int idOrder = Integer.parseInt(fields[0]);
            int idProduct = Integer.parseInt(fields[1]);

            if(lastIdOrder != idOrder){
                // No line break before the very first transaction.
                if(firstTransaction){
                    firstTransaction = false;
                } else {
                    writer.write("\n");
                }
                lastIdOrder = idOrder;
            }

            writer.write(idProduct + " ");
        }
    } catch (IOException exception){
        exception.printStackTrace();
    }
}
/**
 * Converts an order CSV into a transaction file in which every product is replaced by
 * its aisle: one line per order, holding the distinct aisle ids of that order, each
 * followed by a space.
 *
 * The first line of both CSV files is skipped. Rows of one order are expected to be
 * contiguous, since a transaction is flushed each time the order id changes. Products
 * missing from the product file are reported on stderr and skipped instead of failing
 * later with a NullPointerException.
 *
 * @param product path of the product CSV (column 0: product id, column 2: aisle id)
 * @param dataset path of the order CSV (column 0: order id, column 1: product id)
 * @param output  path of the transaction file to (over)write
 */
public static void replaceIdByAisle(String product, String dataset, String output){
    try {
        Map<Integer, Integer> mapProductIdAisleId = loadProductAisles(product);

        // try-with-resources: both files are closed even if a row fails to parse.
        try (BufferedReader reader = new BufferedReader(new FileReader(dataset));
             BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
            Set<Integer> setAisleKept = new HashSet<>();
            String line;
            int previousIdOrder = -1;

            reader.readLine();

            while ((line = reader.readLine()) != null){
                String[] fields = line.split(",");
                int idOrder = Integer.parseInt(fields[0]);
                int idProduct = Integer.parseInt(fields[1]);

                if(previousIdOrder != idOrder){
                    // Nothing has been collected before the first order, so this is a no-op there.
                    writeAisleTransaction(writer, setAisleKept);
                    previousIdOrder = idOrder;
                }

                Integer idAisle = mapProductIdAisleId.get(idProduct);
                if(idAisle == null){
                    System.err.println("Unknown product id: "+ idProduct);
                } else {
                    setAisleKept.add(idAisle);
                }
            }

            // For the last transaction.
            writeAisleTransaction(writer, setAisleKept);
        }
    } catch (IOException e){
        e.printStackTrace();
    }
}

// Reads the product CSV into a product id -> aisle id map; prints the offending row before rethrowing on a non-numeric id.
private static Map<Integer, Integer> loadProductAisles(String product) throws IOException {
    Map<Integer, Integer> mapProductIdAisleId = new HashMap<>();

    // try-with-resources: the CSV reader was previously never closed.
    try (CSVReader csvreader = new CSVReader(new FileReader(product))) {
        String[] row;
        csvreader.readNext();

        while ((row = csvreader.readNext()) != null) {
            try {
                mapProductIdAisleId.put(Integer.valueOf(row[0]), Integer.valueOf(row[2]));
            } catch (NumberFormatException e){
                System.out.println(String.join(",", row));
                throw e;
            }
        }
    }

    return mapProductIdAisleId;
}

// Writes the collected aisles as one transaction line (nothing when empty), then empties the set.
private static void writeAisleTransaction(BufferedWriter writer, Set<Integer> setAisleKept) throws IOException {
    if(!setAisleKept.isEmpty()) {
        for (Integer idAisle : setAisleKept) {
            writer.write(idAisle.toString() + " ");
        }
        writer.write("\n");
    }
    setAisleKept.clear();
}
/**
 * Rewrites a transaction file in place with the items of every line sorted in ascending
 * order. Duplicate items within a line are written once. Every output line ends with a
 * space followed by a line feed.
 *
 * The whole result is built in memory before the file is overwritten; nothing is
 * written when reading fails.
 *
 * @param transactionPath path of the transaction file to sort
 */
public static void sortTransaction(String transactionPath){
    try {
        StringBuilder sortedContent = new StringBuilder();

        // try-with-resources: the reader is closed even if an item fails to parse.
        try (BufferedReader reader = new BufferedReader(new FileReader(transactionPath))) {
            String line;
            while ((line = reader.readLine()) != null){
                SortedSet<Integer> sortedSet = new TreeSet<>();
                for(String item : line.split(" ")){
                    sortedSet.add(Integer.valueOf(item));
                }

                for(Integer item_id : sortedSet){
                    sortedContent.append(item_id);
                    sortedContent.append(' ');
                }
                sortedContent.append('\n');
            }
        }

        // The file is reopened for writing only once it has been read completely.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(transactionPath))) {
            writer.write(sortedContent.toString());
        }
    } catch (IOException e){
        e.printStackTrace();
    }
}
/**
 * Converts a tab-separated order file into a sequence file and an item dictionary.
 *
 * Input columns: 0 is the customer id, 1 the order number and 3 a comma-separated list
 * of item names. Rows of one customer are expected to be contiguous. Every customer
 * becomes one output line: the numeric ids of each order's items followed by
 * {@code -1}, and {@code -2} at the end of the line. Item names receive ids starting
 * at 1, in order of first appearance.
 *
 * The sequence of the last customer is now written as well; it used to be dropped
 * because sequences were only flushed when the next customer started.
 *
 * @param rawSequenceDataset path of the tab-separated file to read
 * @param outputSequence     path of the sequence file to (over)write
 * @param outputInfo         path of the dictionary file to (over)write, one
 *                           {@code name,id} line per item, sorted by name
 */
public static void convertCSVIntoSequences(String rawSequenceDataset, String outputSequence, String outputInfo){
    // try-with-resources: the three files are closed even if a row fails to parse.
    try (BufferedReader reader = new BufferedReader(new FileReader(rawSequenceDataset));
         BufferedWriter writerSequence = new BufferedWriter(new FileWriter(outputSequence));
         BufferedWriter writerInfo = new BufferedWriter(new FileWriter(outputInfo))) {
        String line;
        StringBuilder sequenceBuilder = new StringBuilder("");
        int lineNumber = 0;

        Map<String, Integer> mapNameId = new HashMap<>();
        int freeID = 1;
        int lastCustomerID = -1;
        int lastOrderNumber = -1;
        boolean firstSequence = true;

        while ((line = reader.readLine()) != null){
            String[] lineParts = line.split("\t");
            Integer customerID = Integer.valueOf(lineParts[0]);
            Integer orderNumber = Integer.valueOf(lineParts[1]);

            if(!customerID.equals(lastCustomerID)){
                if(firstSequence){
                    firstSequence = false;
                }else{
                    // A new customer starts: close and write the previous customer's sequence.
                    sequenceBuilder.append("-2");
                    writerSequence.write(sequenceBuilder.toString());
                    writerSequence.newLine();
                    sequenceBuilder = new StringBuilder("");
                }
                lastCustomerID = customerID;
            } else if(orderNumber <= lastOrderNumber){
                // Orders of one customer are expected in increasing order; report the line otherwise.
                System.out.println("Line "+ lineNumber +" :c");
            }

            for(String item : lineParts[3].split(",")){
                // An empty name ends the item list of this order.
                if(item.equals("")){
                    break;
                }

                Integer itemID = mapNameId.get(item);
                if(itemID == null){
                    itemID = freeID;
                    mapNameId.put(item, itemID);
                    freeID++;
                }

                sequenceBuilder.append(itemID);
                sequenceBuilder.append(' ');
            }
            sequenceBuilder.append("-1 ");
            lastOrderNumber = orderNumber;
            lineNumber++;
        }

        // Flush the sequence of the last customer, if at least one row was read.
        if(!firstSequence){
            sequenceBuilder.append("-2");
            writerSequence.write(sequenceBuilder.toString());
            writerSequence.newLine();
        }

        for(String item : new TreeSet<>(mapNameId.keySet())){
            writerInfo.write(item);
            writerInfo.write(',');
            writerInfo.write(mapNameId.get(item).toString());
            writerInfo.newLine();
        }
    } catch (IOException e){
        e.printStackTrace();
    }
}
/**
 * Rewrites a sequence file in place with the items of every itemset sorted in ascending
 * order. Duplicate items within an itemset are written once.
 *
 * Each line is expected to look like {@code "<items> -1 <items> -1 -2"}. The whole
 * result is built in memory before the file is overwritten; nothing is written when
 * reading fails.
 *
 * @param sequenceDataset path of the sequence file to sort
 */
public static void sortSequences(String sequenceDataset){
    try {
        StringBuilder sortedContent = new StringBuilder();

        // try-with-resources: the reader is closed even if an item fails to parse.
        try (BufferedReader reader = new BufferedReader(new FileReader(sequenceDataset))) {
            String line;
            while ((line = reader.readLine()) != null){
                // Built per line so that the trimming below can never reach into a previous line.
                StringBuilder sequence = new StringBuilder();

                for(String itemset : line.split(" -1 ")) {
                    // The chunk holding the "-2" terminator carries no items.
                    if(!itemset.contains("-2")) {
                        SortedSet<Integer> sortedSet = new TreeSet<>();
                        for (String item : itemset.split(" ")) {
                            sortedSet.add(Integer.valueOf(item));
                        }

                        for(Integer item_id : sortedSet){
                            sequence.append(item_id);
                            sequence.append(' ');
                        }
                    }
                    sequence.append("-1 ");
                }

                // Drop the separator emitted after the last chunk, then close the sequence.
                int lastSeparator = sequence.lastIndexOf("-1");
                if(lastSeparator >= 0){
                    sequence.delete(lastSeparator, lastSeparator + 3);
                }
                sortedContent.append(sequence);
                sortedContent.append("-2\n");
            }
        }

        // The file is reopened for writing only once it has been read completely.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(sequenceDataset))) {
            writer.write(sortedContent.toString());
        }
    } catch (IOException e){
        e.printStackTrace();
    }
}
}