commit 910b138668a34d5a4862b924a99d25a8c698c2ad Author: Gregory Martin Date: Mon Nov 6 11:39:57 2017 +0100 Initial commt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6d3a330 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +Subject/ +target/ +.idea/ diff --git a/DMV.iml b/DMV.iml new file mode 100644 index 0000000..6a66e9f --- /dev/null +++ b/DMV.iml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1e0cb81 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# DMVHomework - Note to the teacher + +Three data mining algorithms have been used for this homework: Apriori, LCM and BIDE+. +Each of these algorithms has its own class, named after the algorithm and located in the package *algorithm*. +Each class has a main method that can launch either a single run of the selected algorithm or an experiment on this algorithm. + +To provide inputs to those algorithms, the class *main.DatasetConverter* takes the raw dataset and converts it into the appropriate format for Apriori and LCM (*.transaction*) and for BIDE+ (*.sequence*). +For BIDE+, since the raw dataset contains named items, the formatted *.sequence* file contains only numbers. A file with the same name and the extension *.seqinfo* gives the association between the name of each item and its ID. + +To explore the patterns returned by those algorithms, several methods in the class *main.DataExplorer* help to select meaningful patterns. 
diff --git a/data.7z b/data.7z new file mode 100644 index 0000000..9858877 Binary files /dev/null and b/data.7z differ diff --git a/lib/SPMF.jar b/lib/SPMF.jar new file mode 100644 index 0000000..9f0892b Binary files /dev/null and b/lib/SPMF.jar differ diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..3c5ceac --- /dev/null +++ b/pom.xml @@ -0,0 +1,35 @@ + + + 4.0.0 + + fr.urao.dmv + DMV Homework + 1 + + + + org.jetbrains + annotations + RELEASE + + + com.opencsv + opencsv + 4.0 + + + + + + + maven-compiler-plugin + + 1.8 + 1.8 + + + + + \ No newline at end of file diff --git a/src/main/java/algorithm/Apriori.java b/src/main/java/algorithm/Apriori.java new file mode 100644 index 0000000..5e47c56 --- /dev/null +++ b/src/main/java/algorithm/Apriori.java @@ -0,0 +1,92 @@ +package algorithm; + +import ca.pfv.spmf.algorithms.frequentpatterns.apriori.AlgoApriori; +import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; +import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets; +import main.DataExplorer; + +import java.io.*; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class Apriori { + + public static void main(String[] args) throws InterruptedException { + System.out.println("Start Time: "+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME)); + String experimentProduct = "Apriori-ProductID"; + String experimentAisle = "Apriori-AisleID"; + + runApriori(DataExplorer.transactionsetAisle, DataExplorer.aprioriPatterns, 300d/131209); + + runExperimentApriori(DataExplorer.transactionsetProduct, experimentProduct); + runExperimentApriori(DataExplorer.transactionsetAisle, experimentAisle); + + System.out.println("Apriori Ended."); + } + + private static void runApriori(String transactionPath, String patternPath, double minsup){ + + AlgoApriori apriori = new AlgoApriori(); + try { + 
apriori.runAlgorithm(minsup, + transactionPath, + patternPath); + apriori.printStats(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private static void runExperimentApriori(String transactionPath, String experimentName){ + List listMinsup = new ArrayList<>(); + int step = 1; + + for(int minsup = 300; minsup < 1000; minsup += 50){ + listMinsup.add(minsup); + } + + Collections.shuffle(listMinsup); + + try { + BufferedWriter writerTime = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + experimentName +"Time.perf")); + BufferedWriter writerCount = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + experimentName +"Count.perf")); + + writerTime.write("minsup,time,algorithm"); + writerTime.newLine(); + + writerCount.write("minsup,pattern_count,algorithm"); + writerCount.newLine(); + + for(Integer minsup : listMinsup) { + AlgoApriori apriori = new AlgoApriori(); + + System.out.println("["+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME) +"]Step "+ step +"/"+ listMinsup.size() +" - Starting with minsup: "+ minsup); + + long start = System.currentTimeMillis(); + Itemsets result = apriori.runAlgorithm(minsup/131209d, transactionPath, null); + long end = System.currentTimeMillis(); + + List lvl = new ArrayList<>(); + for(List level : result.getLevels()){ + lvl.addAll(level); + } + + writerCount.write(minsup +","+ lvl.size() +","+ experimentName); + writerCount.newLine(); + + writerTime.write(minsup +","+ (end - start)/1000 +","+ experimentName); + writerTime.newLine(); + + step++; + } + + writerTime.close(); + writerCount.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/src/main/java/algorithm/BIDEPlus.java b/src/main/java/algorithm/BIDEPlus.java new file mode 100644 index 0000000..778a3ea --- /dev/null +++ b/src/main/java/algorithm/BIDEPlus.java @@ -0,0 +1,32 @@ +package algorithm; + +import ca.pfv.spmf.algorithms.sequentialpatterns.prefixspan.AlgoBIDEPlus; +import 
main.DataExplorer; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; + +public class BIDEPlus { + + public static void main(String[] args) { + System.out.println("Start Time: "+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME)); + + runBIDEPlus(DataExplorer.transactionsetSequence, DataExplorer.bideplusPatterns); + } + + private static void runBIDEPlus(String sequenceDataset, String patternOutput){ + try { + AlgoBIDEPlus bideplus = new AlgoBIDEPlus(); + double minsup = 400d/19999; + + bideplus.setShowSequenceIdentifiers(false); + bideplus.runAlgorithm(sequenceDataset, minsup, patternOutput); + + bideplus.printStatistics(); + } catch (IOException e) { + e.printStackTrace(); + } + + } +} diff --git a/src/main/java/algorithm/LCM.java b/src/main/java/algorithm/LCM.java new file mode 100644 index 0000000..c6ad458 --- /dev/null +++ b/src/main/java/algorithm/LCM.java @@ -0,0 +1,97 @@ +package algorithm; + +import ca.pfv.spmf.algorithms.frequentpatterns.lcm.AlgoLCM; +import ca.pfv.spmf.algorithms.frequentpatterns.lcm.Dataset; +import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemset; +import ca.pfv.spmf.patterns.itemset_array_integers_with_count.Itemsets; +import main.DataExplorer; +import org.jetbrains.annotations.Nullable; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class LCM { + + public static void main(String[] args) { + System.out.println("Start Time: "+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME)); + String exprimentProduct = "LCM-ProductID"; + + runLCM(DataExplorer.transactionsetProduct, DataExplorer.lcmPatterns, 300d/131209); + + runExperimentLCM(DataExplorer.transactionsetProduct, exprimentProduct); + + System.out.println("LCM Ended."); + } + + private 
static void runLCM(String datasetPath, @Nullable String output, double minsup){ + + AlgoLCM lcm = new AlgoLCM(); + try { + Dataset dataset = new Dataset(datasetPath); + lcm.runAlgorithm(minsup, dataset, output); + lcm.printStats(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + private static void runExperimentLCM(String transactionPath, String experimentName){ + List listMinsup = new ArrayList<>(); + int step = 1; + + for(int minsup = 50; minsup < 1000; minsup += 50){ + listMinsup.add(minsup); + } + + Collections.shuffle(listMinsup); + + try { + BufferedWriter writerTime = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + "performanceLCMTime.csv")); + BufferedWriter writerCount = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + "performanceLCMCount.csv")); + + writerTime.write("minsup,time,algorithm"); + writerTime.newLine(); + + writerCount.write("minsup,pattern_count,algorithm"); + writerCount.newLine(); + + for(Integer minsup : listMinsup) { + AlgoLCM lcm = new AlgoLCM(); + // if true in next line it will find only closed itemsets, otherwise, all frequent itemsets + + System.out.println("["+ LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_TIME) +"]Step "+ step +"/"+ listMinsup.size() +" - Starting with minsup: "+ minsup); + long start = System.currentTimeMillis(); + Dataset dataset = new Dataset(transactionPath); + Itemsets itemsets = lcm.runAlgorithm(minsup/131209d, dataset, null); + long end = System.currentTimeMillis(); + + + List lvl = new ArrayList<>(); + for(List level : itemsets.getLevels()){ + lvl.addAll(level); + } + + writerCount.write(minsup +","+ lvl.size() +",LCM-ProductId"); + writerCount.newLine(); + + writerTime.write(minsup +","+ (end - start)/1000 +","+ experimentName); + writerTime.newLine(); + + step++; + } + + writerTime.close(); + writerCount.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + } +} diff --git a/src/main/java/main/DataExplorer.java 
b/src/main/java/main/DataExplorer.java new file mode 100644 index 0000000..c74a82b --- /dev/null +++ b/src/main/java/main/DataExplorer.java @@ -0,0 +1,616 @@ +package main; + +import com.opencsv.CSVReader; + +import java.io.*; +import java.util.*; + +public class DataExplorer { + + public static String workingDirectory = "/home/toshuumilia/tmp/testDMV/"; + public static String rawdatasetProduct = workingDirectory + "order_products__train.csv"; + public static String rawSequenceDataset = workingDirectory + "transactions_seq.txt"; + + public static String transactionsetProduct = workingDirectory + "trainProduct.transaction"; + public static String transactionsetAisle = workingDirectory + "trainAisle.transaction"; + public static String transactionsetSequence = workingDirectory + "customer.sequence"; + + public static String productInformation = workingDirectory + "products.csv"; + public static String aisleInformation = workingDirectory + "aisles.csv"; + public static String sequenceInformation = workingDirectory + "customer.seqinfo"; + + public static String lcmPatterns = workingDirectory + "pattern.lcm"; + public static String aprioriPatterns = workingDirectory + "pattern.apriori"; + public static String bideplusPatterns = workingDirectory + "pattern.bideplus"; + + + public static void main(String[] args) { + Set itemsItemsets = new HashSet<>(); + Set itemsSupport = new HashSet<>(); + Set itemsSequence = new HashSet<>(); + Set antecedents = new HashSet<>(); + Set consequents = new HashSet<>(); + Set aisles = new HashSet<>(); + Set itemExclusions = new HashSet<>(); + + // Create the dataset in the transaction format. + DatasetConverter.convertCSVIntoTransaction(rawdatasetProduct, transactionsetProduct); + DatasetConverter.sortTransaction(transactionsetProduct); + + // Find the mamximum support in the product transaction set. + checkMaxSupport(transactionsetProduct); + + // Replace the product id with its aisle id. 
+ DatasetConverter.replaceIdByAisle(productInformation, rawdatasetProduct, transactionsetAisle); + DatasetConverter.sortTransaction(transactionsetAisle); + + // Find the maximum support in the aisle transaction set + checkMaxSupport(transactionsetAisle); + + // Tell how much items in the *product* transaction set has a support lower than 1% of the highest one. + separateItemOccurrence(workingDirectory +"itemOccurrence.csv", getItemOccurrence(transactionsetProduct), 187.27); + // Tell how much items in the *aisle* transaction set has a support lower than 1% of the highest one. + separateItemOccurrence(workingDirectory +"itemOccurrence.csv", getItemOccurrence(transactionsetAisle), 721.28); + + + // Find 100 itemests with the highest support and with at least 2 items in each itemsets. + findMaxSupportItemsets(lcmPatterns, 100, 1); + + // Find the itemsets with the items I want. + itemsItemsets.addAll(Arrays.asList(24,83,120,123)); + chooseItemset(lcmPatterns, itemsItemsets); + + // Compute the support of an itemset + itemsSupport.addAll(Arrays.asList(24, 83)); + computeSupport(itemsSupport, transactionsetAisle); + + // Compute the confidence of an association rule + antecedents.addAll(Arrays.asList(24, 83, 120)); + consequents.addAll(Arrays.asList(123)); + computeConfidence(antecedents, consequents, transactionsetAisle); + + // Get the name of some aisles + aisles.addAll(Arrays.asList(24, 83)); + findNameAisle(aisles, aisleInformation); + + // Create the sequence dataset + DatasetConverter.convertCSVIntoSequences(rawSequenceDataset, transactionsetSequence, sequenceInformation); + DatasetConverter.sortSequences(transactionsetSequence); + + //Find the 10 most supported sequences containing at least 3 itemsets with at least one having 2 items, and exclude all sequences having the provided items. 
+ itemExclusions.addAll(Arrays.asList(93, 474, 6, 66)); + findMaxSupportSequence(bideplusPatterns, 10, 2, 2, itemExclusions); + + itemsSequence.addAll(Arrays.asList(75, 251)); + findNameProductSeq(itemsSequence, sequenceInformation); + } + + public static Map getItemOccurrence(String transactionPath){ + Map mapItemOccurrence = new HashMap<>(); + + if(transactionPath.contains(".transaction")){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(transactionPath)); + String line; + int numberTransaction = 0; + + while ((line = reader.readLine()) != null){ + String[] lineSplit = line.split(" "); + + for(String split : lineSplit){ + try { + Integer item = Integer.valueOf(split); + + Integer numberOccurrence = mapItemOccurrence.getOrDefault(item, 0) + 1; + + mapItemOccurrence.put(item, numberOccurrence); + } catch (NumberFormatException e){ + System.err.println("NumberFormatException"); + } + } + + numberTransaction++; + } + } catch (IOException e) { + e.printStackTrace(); + } + } + return mapItemOccurrence; + } + + public static void checkMaxSupport(String transactionPath){ + if(transactionPath.contains(".transaction")){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(transactionPath)); + Map mapItemOccurrence = new HashMap<>(); + String line; + int numberTransaction = 0; + int maxSupport = 0; + int idMaxSupport = -1; + + while ((line = reader.readLine()) != null){ + String[] lineSplit = line.split(" "); + + for(String split : lineSplit){ + try { + Integer item = Integer.valueOf(split); + + Integer numberOccurrence = mapItemOccurrence.getOrDefault(item, 0) + 1; + + mapItemOccurrence.put(item, numberOccurrence); + if (maxSupport < numberOccurrence) { + maxSupport = numberOccurrence; + idMaxSupport = item; + } + } catch (NumberFormatException e){ + System.err.println("NumberFormatException"); + } + } + + numberTransaction++; + } + + System.out.println("Number of Transactions: "+ numberTransaction); + System.out.println("Most present item: 
"+ idMaxSupport + " ("+ maxSupport +" items)"); + System.out.println("Max relative support for Apriori: "+ ((double) maxSupport)/numberTransaction); + + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + throw new RuntimeException(); + } + } + + public static void separateItemOccurrence(String output, Map mapItemOccurrence, double threshold){ + int[] category = new int[2]; + double separator = 721.78; + + for(Integer itemId : mapItemOccurrence.keySet()){ + int index = mapItemOccurrence.get(itemId) < threshold ? 0 : 1; + category[index]++; + } + + try{ + BufferedWriter writer = new BufferedWriter(new FileWriter(output)); + + writer.write("interval,occurrence"); + writer.newLine(); + + writer.write("[0.00;"+ threshold +"[,"+ category[0]); + writer.newLine(); + + writer.write("["+ threshold +";"+ (int) (threshold*100) +"],"+ category[1]); + writer.newLine(); + + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void chooseItemset(String patternPath, Set itemsNeeded){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(patternPath)); + StringBuilder stringBuilder = new StringBuilder(""); + String line; + + while((line = reader.readLine()) != null){ + String[] linePart = line.split(" #SUP: "); + String[] items = linePart[0].split(" "); + Set recognizedItems = new HashSet<>(); + + for(String item : items){ + Integer item_id = Integer.valueOf(item); + if(itemsNeeded.contains(item_id)){ + recognizedItems.add(item_id); + } + } + + if(recognizedItems.size() == itemsNeeded.size()){ + stringBuilder.append(line); + stringBuilder.append('\n'); + } + } + + System.out.println(stringBuilder.toString()); + + reader.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void computeSupport(Set itemset, String dataset){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(dataset)); + String line; + int support = 0; + + while ((line = 
reader.readLine()) != null){ + String[] lineParts = line.split(" "); + Set recognizedItems = new HashSet<>(); + + for(String item : lineParts){ + Integer item_id = Integer.valueOf(item); + if(itemset.contains(item_id)){ + recognizedItems.add(item_id); + } + } + + if(recognizedItems.size() == itemset.size()){ + support++; + } + } + + System.out.println(prettyprintItemset(itemset) + ": Supp="+ support); + + reader.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void computeConfidence(Set antecedents, Set consequents, String dataset){ + Set antUcon = new HashSet<>(); + double antUconSupport = 0; + double antSupport = 0; + + antUcon.addAll(antecedents); + antUcon.addAll(consequents); + + try{ + BufferedReader reader = new BufferedReader(new FileReader(dataset)); + String line; + + while((line = reader.readLine()) != null){ + String[] items = line.split(" "); + Set recognizedAntItems = new HashSet<>(); + Set recognizedAntUConItems = new HashSet<>(); + + for(String item : items){ + Integer item_id = Integer.valueOf(item); + + if(antecedents.contains(item_id)){ + recognizedAntItems.add(item_id); + } + if(antUcon.contains(item_id)){ + recognizedAntUConItems.add(item_id); + } + } + + if(recognizedAntItems.size() == antecedents.size()){ + antSupport++; + } + if(recognizedAntUConItems.size() == antUcon.size()){ + antUconSupport++; + } + } + + // Pretty print + StringBuilder stringBuilderAnt = new StringBuilder(""); + for(Integer item_id : antecedents){ + stringBuilderAnt.append(item_id); + stringBuilderAnt.append(','); + } + stringBuilderAnt.deleteCharAt(stringBuilderAnt.lastIndexOf(",")); + + // Pretty print + StringBuilder stringBuilderCon = new StringBuilder(""); + for(Integer item_id : consequents){ + stringBuilderCon.append(item_id); + stringBuilderCon.append(','); + } + stringBuilderCon.deleteCharAt(stringBuilderCon.lastIndexOf(",")); + + System.out.println("{"+ stringBuilderAnt +"} -> {"+ stringBuilderCon +"}: Conf="+ antUconSupport 
/ antSupport); + System.out.println("Antecedents support: "+ antSupport); + System.out.println("Antecedents U Consequents support: "+ antUconSupport); + + reader.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void findMaxSupportItemsets(String patternPath, int n, int excludePatternSizeLessThan){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(patternPath)); + String line; + + Map mapISSupport = new HashMap<>(); + + while((line = reader.readLine()) != null){ + String[] lineParts = line.split(" #SUP: "); + String[] items = lineParts[0].split(" "); + + if(items.length > excludePatternSizeLessThan) { + Integer support = Integer.valueOf(lineParts[1]); + SortedSet itemset = new TreeSet<>(); + + for (String item_id : items) { + itemset.add(Integer.valueOf(item_id)); + } + + mapISSupport.put(prettyprintItemset(itemset), support); + } + } + reader.close(); + + System.out.println(mapISSupport.keySet().size() +" interesting patterns found."); + List nMaxItemset = findMaxSupportMap(new HashMap<>(mapISSupport), n); + + for(String itemsetStr : nMaxItemset) { + System.out.println(itemsetStr); + } + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void findMaxSupportSequence(String patternPath, int n, int excludeSequenceSizeLessThan, int sizeOneItemsetAtLeast, Set excludeItems){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(patternPath)); + String line; + + Map mapSSupport = new HashMap<>(); + + while((line = reader.readLine()) != null){ + String[] lineParts = line.split(" #SUP: "); + String[] itemsets = lineParts[0].split("-1"); + + if(itemsets.length > excludeSequenceSizeLessThan) { + Integer support = Integer.valueOf(lineParts[1]); + + boolean itemExcluded = false; + boolean itemsetSizeExcluded = true; + + for(String itemset : itemsets){ + String[] items = itemset.split(" "); + int nbItems = 0; + + for(String item : items){ + if(!item.equals("")){ + int itemID = 
Integer.valueOf(item); + + for(Integer excludedItem : excludeItems) { + if (excludedItem.equals(itemID)){ + itemExcluded = true; + break; + } + } + + nbItems++; + } + } + + if(nbItems >= sizeOneItemsetAtLeast){ + itemsetSizeExcluded = false; + } + } + + if(!itemExcluded && !itemsetSizeExcluded) { + mapSSupport.put(prettyprintSequence(lineParts[0]), support); + } + } + } + reader.close(); + + System.out.println(mapSSupport.keySet().size() +" interesting patterns found."); + + List nMaxItemset = findMaxSupportMap(new HashMap<>(mapSSupport), n); + + for(String itemsetStr : nMaxItemset) { + System.out.println(itemsetStr); + } + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static List findMaxSupportMap(Map mapSupport, int n){ + List nMaxItemset = new ArrayList<>(); + + for(int i = 0; i < n; i++){ + String itemsetMax = ""; + Integer supportMax = -1; + + for(String itemset : mapSupport.keySet()){ + Integer support = mapSupport.get(itemset); + if(supportMax < support){ + itemsetMax = itemset; + supportMax = support; + } + } + + if(!itemsetMax.equals("")) { + nMaxItemset.add(itemsetMax + "-Supp=" + supportMax); + mapSupport.remove(itemsetMax); + } + } + + return nMaxItemset; + } + + public static void findNameAisle(Set aislesNeeded, String aisleInformations){ + try { + StringBuilder stringBuilder = new StringBuilder("{*"); + CSVReader csvReader = new CSVReader(new FileReader(aisleInformations)); + String[] lineParts; + + csvReader.readNext(); // Trash attributes name line + while((lineParts = csvReader.readNext()) != null){ + if(aislesNeeded.contains(Integer.valueOf(lineParts[0]))){ + stringBuilder.append(lineParts[1]); + stringBuilder.append(", "); + } + } + stringBuilder.delete(stringBuilder.lastIndexOf(","), stringBuilder.lastIndexOf(",")+2); + stringBuilder.append("*}"); + + System.out.println(stringBuilder); + + csvReader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void findNameProductSeq(Set 
productsNeeded, String sequenceInformationPath){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(sequenceInformationPath)); + String line; + + while((line = reader.readLine()) != null){ + String[] lineParts = line.split(","); + Integer itemID = Integer.valueOf(lineParts[1]); + +// if((lineParts[0] +","+ lineParts[1]).contains(",93")){ +// System.out.println((lineParts[0] +","+ lineParts[1])); +// } + + if(productsNeeded.contains(itemID)){ + System.out.println(lineParts[1] +"=>"+ lineParts[0]); + } + } + + reader.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + private static String prettyprintItemset(Set itemset){ + StringBuilder stringBuilder = new StringBuilder("{"); + + for(Integer item_id : itemset){ + stringBuilder.append(item_id); + stringBuilder.append(','); + } + stringBuilder.deleteCharAt(stringBuilder.lastIndexOf(",")); + stringBuilder.append('}'); + + return stringBuilder.toString(); + } + + private static String prettyprintSequence(String rawSequence){ + String[] itemsets = rawSequence.split("-1"); + List> sequence = new ArrayList<>(); + + for(String rawitemset : itemsets){ + String[] items = rawitemset.split(" "); + Set itemset = new HashSet<>(); + + for(String item : items){ + if(!item.equals("")){ + itemset.add(Integer.valueOf(item)); + } + } + + sequence.add(itemset); + } + + return prettyprintSequence(sequence); + } + + private static String prettyprintSequence(List> sequence){ + StringBuilder stringBuilder = new StringBuilder("["); + + for(Set itemset : sequence){ + stringBuilder.append(prettyprintItemset(itemset)); + stringBuilder.append(" "); + } + stringBuilder.deleteCharAt(stringBuilder.lastIndexOf(" ")); + + stringBuilder.append("]"); + return stringBuilder.toString(); + } + + // public static void countNumberTransactionWithMoreThanNItem(String transactionPath, int n){ +// try{ +// BufferedReader reader = new BufferedReader(new FileReader(transactionPath)); +// String line; +// int sum = 0; +// +// 
while((line = reader.readLine()) != null){ +// sum += line.split(" ").length >= n ? 1 : 0; +// } +// +// System.out.println("Number of transactions with at least "+ n +" items: "+ sum); +// +// reader.close(); +// } catch (IOException e){ +// e.printStackTrace(); +// } +// } + // public static void countNumberItemInTransaction(File input, File output){ +// if(input.getAbsolutePath().contains(".transaction")) { +// try { +// BufferedReader reader = new BufferedReader(new FileReader(input)); +// BufferedWriter writer = new BufferedWriter(new FileWriter(output)); +// String line; +// +// writer.write("length"); +// writer.newLine(); +// +// while ((line = reader.readLine()) != null) { +// String[] lineSplit = line.split(" "); +// writer.write(lineSplit.length +""); +// writer.newLine(); +// } +// +// writer.close(); +// reader.close(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } else { +// throw new RuntimeException(""); +// } +// } + + // public static void groupItemOccurrence(String output, Map mapItemOccurrence){ +// Map mapIntervalCount = new HashMap<>(); +// List listInterval = new ArrayList<>(); +// +// DecimalFormat nFormat = new DecimalFormat("#.##"); +// DecimalFormatSymbols dfs = new DecimalFormatSymbols(); +// dfs.setDecimalSeparator('.'); +// nFormat.setDecimalFormatSymbols(dfs); +// +// +// int nbInterval = 100; +// double max = 72178d +1; +// +// for(int i = 0; i < nbInterval; i++){ +// String interval = "["+ nFormat.format((max/nbInterval)*i) +";"+ nFormat.format((max/nbInterval)*(i+1)) +"["; +// listInterval.add(interval); +// mapIntervalCount.put(interval, 0); +// } +// +// for(Integer itemId : mapItemOccurrence.keySet()){ +// String interval = listInterval.get((int) (mapItemOccurrence.get(itemId)/(max/nbInterval))); +// +// mapIntervalCount.put(interval, mapIntervalCount.getOrDefault(interval, 0)+1); +// } +// +// try{ +// BufferedWriter writer = new BufferedWriter(new FileWriter(output)); +// +// 
writer.write("interval,occurrence"); +// writer.newLine(); +// +// for(String key : listInterval){ +// if(!mapIntervalCount.get(key).equals(0)) { +// writer.write(key + "," + mapIntervalCount.get(key)); +// writer.newLine(); +// } +// } +// +// writer.close(); +// } catch (IOException e) { +// e.printStackTrace(); +// } +// } +} diff --git a/src/main/java/main/DatasetConverter.java b/src/main/java/main/DatasetConverter.java new file mode 100644 index 0000000..d571964 --- /dev/null +++ b/src/main/java/main/DatasetConverter.java @@ -0,0 +1,284 @@ +package main; + +import com.opencsv.CSVReader; + +import java.io.*; +import java.util.*; + +public class DatasetConverter { + + public static void convertCSVIntoTransaction(String input, String output){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(input)); + BufferedWriter writer = new BufferedWriter(new FileWriter(output)); + + String line; + int idOrder; + int idProduct; + + + int lastIdOrder = -1; + boolean firstTransaction = true; + + // Delete first line with the header. 
+ reader.readLine(); + + while ((line = reader.readLine()) != null){ + String[] lines = line.split(","); + + idOrder = Integer.valueOf(lines[0]); + idProduct = Integer.valueOf(lines[1]); + + + if(lastIdOrder != idOrder){ + if(firstTransaction){ + firstTransaction = false; + } else { + writer.write("\n"); + } + + lastIdOrder = idOrder; + } + + writer.write(idProduct + " "); + } + + reader.close(); + writer.close(); + } catch (IOException exception){ + exception.printStackTrace(); + } + } + + public static void replaceIdByAisle(String product, String dataset, String output){ + Map mapProductIdAisleId = new HashMap<>(); + + try{ + Set setAisleKept = new HashSet<>(); + BufferedReader reader;// = new BufferedReader(new FileReader(product)); + BufferedWriter writer;// = new BufferedWriter(new FileWriter(DataExplorer.workingDirectory + "replace.tmp")); + String line; + int idOrder; + int previousIdOrder = -1; + int idProduct; + boolean firstTransaction = true; + + CSVReader csvreader = new CSVReader(new FileReader(product)); + String[] nline; + + csvreader.readNext(); + while ((nline = csvreader.readNext()) != null) { + try { + mapProductIdAisleId.put(Integer.valueOf(nline[0]), Integer.valueOf(nline[2])); + } catch (NumberFormatException e){ + + for(String l : nline) { + System.out.print(l); + if(!l.equals(nline[nline.length-1])) { + System.out.print(","); + } + } + System.out.println(); + throw e; + } + } + + reader = new BufferedReader(new FileReader(dataset)); + writer = new BufferedWriter(new FileWriter(output)); + + reader.readLine(); + + while ((line = reader.readLine()) != null){ + String[] lines = line.split(","); + + idOrder = Integer.valueOf(lines[0]); + idProduct = Integer.valueOf(lines[1]); + + + if(previousIdOrder != idOrder){ + if(firstTransaction){ + firstTransaction = false; + } else { + + if(!setAisleKept.isEmpty()) { + for (Integer idAisle : setAisleKept) { + writer.write(idAisle.toString() + " "); + } + writer.write("\n"); + } + + setAisleKept.clear(); 
+ } + + previousIdOrder = idOrder; + } + + setAisleKept.add(mapProductIdAisleId.get(idProduct)); + } + + // For the last transaction. + if(!setAisleKept.isEmpty()) { + for (Integer idAisle : setAisleKept) { + writer.write(idAisle.toString() + " "); + } + writer.write("\n"); + } + + reader.close(); + writer.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void sortTransaction(String transactionPath){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(transactionPath)); + StringBuilder stringBuilder = new StringBuilder(""); + String line; + + while ((line = reader.readLine()) != null){ + String[] linePart = line.split(" "); + SortedSet sortedSet = new TreeSet<>(); + + for(String item : linePart){ + sortedSet.add(Integer.valueOf(item)); + } + + for(Integer item_id : sortedSet){ + stringBuilder.append(item_id); + stringBuilder.append(' '); + } + + stringBuilder.append('\n'); + } + + reader.close(); + BufferedWriter writer = new BufferedWriter(new FileWriter(transactionPath)); + + writer.write(stringBuilder.toString()); + + writer.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void convertCSVIntoSequences(String rawSequenceDataset, String outputSequence, String outputInfo){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(rawSequenceDataset)); + BufferedWriter writerSequence = new BufferedWriter(new FileWriter(outputSequence)); + BufferedWriter writerInfo = new BufferedWriter(new FileWriter(outputInfo)); + + String line; + StringBuilder sequenceBuilder = new StringBuilder(""); + int lineNumber = 0; + + Map mapNameId = new HashMap<>(); + int freeID = 1; + int lastCustomerID = -1; + int lastOrderNumber = -1; + boolean firstSequence = true; + + while ((line = reader.readLine()) != null){ + String[] lineParts = line.split("\t"); + Integer customerID = Integer.valueOf(lineParts[0]); + Integer orderNumber = Integer.valueOf(lineParts[1]); + + 
if(!customerID.equals(lastCustomerID)){ + if(firstSequence){ + firstSequence = false; + }else{ + sequenceBuilder.append("-2"); + writerSequence.write(sequenceBuilder.toString()); + writerSequence.newLine(); + + sequenceBuilder = new StringBuilder(""); + } + + lastCustomerID = customerID; + } else if(orderNumber <= lastOrderNumber){ + System.out.println("Line "+ lineNumber +" :c"); + } + + String[] items = lineParts[3].split(","); + for(String item : items){ + if(item.equals("")){ + break; + } + + Integer itemID = mapNameId.getOrDefault(item, -1); + + if(itemID.equals(-1)){ + mapNameId.put(item, freeID); + itemID = freeID; + freeID++; + } + + sequenceBuilder.append(itemID); + sequenceBuilder.append(' '); + } + sequenceBuilder.append("-1 "); + + lastOrderNumber = orderNumber; + lineNumber++; + } + + for(String item : new TreeSet<>(mapNameId.keySet())){ + writerInfo.write(item); + writerInfo.write(','); + writerInfo.write(mapNameId.get(item).toString()); + writerInfo.newLine(); + } + + reader.close(); + writerSequence.close(); + writerInfo.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } + + public static void sortSequences(String sequenceDataset){ + try{ + BufferedReader reader = new BufferedReader(new FileReader(sequenceDataset)); + StringBuilder stringBuilder = new StringBuilder(""); + String line; + + while ((line = reader.readLine()) != null){ + String[] linePart = line.split(" -1 "); + + for(String itemset : linePart) { + if(!itemset.contains("-2")) { + SortedSet sortedSet = new TreeSet<>(); + String[] items = itemset.split(" "); + + for (String item : items) { + sortedSet.add(Integer.valueOf(item)); + } + + for(Integer item_id : sortedSet){ + stringBuilder.append(item_id); + stringBuilder.append(' '); + } + } + + stringBuilder.append("-1 "); + } + + stringBuilder.delete(stringBuilder.lastIndexOf("-1"), stringBuilder.lastIndexOf("-1") +3); + + stringBuilder.append("-2\n"); + } + + reader.close(); + BufferedWriter writer = new BufferedWriter(new 
FileWriter(sequenceDataset)); + + writer.write(stringBuilder.toString()); + + writer.close(); + } catch (IOException e){ + e.printStackTrace(); + } + } +}