diff --git a/README.md b/README.md index c12bd7c..f550441 100644 --- a/README.md +++ b/README.md @@ -1 +1,27 @@ -# OB-Tree \ No newline at end of file +# OB-Tree +OB-Tree: A New Write-Optimization Index on Out-of-Core Column-Store Databases + +## folder structure + +* basic + - fundamental classes +* exp_merge_ob + - data cleaning experiments using OB-Tree +* exp_merge_progressive + - data cleaning experiments using the progressive approach +* exp_select + - data select experiments +* exp_update + - data update experiments + +## contact + +Feng "George" Yu, Ph.D. +Assistant Professor +Dept. Computer Science and Information Systems +Youngstown State University +Youngstown, OH, 44555 +YSU Data Lab: http://datalab.ysu.edu/ +Email: fyu@ysu.edu + + diff --git a/basic/btree/BTree.java b/basic/btree/BTree.java new file mode 100644 index 0000000..a6f960b --- /dev/null +++ b/basic/btree/BTree.java @@ -0,0 +1,478 @@ +package basic.btree; + +import java.io.File; +import java.util.ArrayList; +import java.util.Scanner; +import java.io.IOException; + +import basic.btree.*; +import basic.util.DataRetriever; + +/** + * B-Tree implementation largely based on the implementation made in Algorithms + * book by Robert Sedgewick. Original code can be found in the book web + * site. + * + * @author cgavidia + * + * @param + * Type of the Search Key + * @param + * Type of the Value Stored + */ +@SuppressWarnings("unchecked") +public class BTree, Value> { + + /** + * Tree parameter. Every node must have at most M - 1 key-link pairs + */ +// public static int M = 16; //for small tests +// public static int M = 64; //for small tests + public static int M = 128; +// public static int M = 32768;//2^15 for MB level? +// public static int M = 8388608;//2^23 for GB level? +// public static int M = 1_000_000; +// public static int M = Integer.MAX_VALUE;//for large data test + + + protected Node root; + /** + * Height of B-Tree + */ + private int height; + /** + * Number of key-value pairs in B-Tree + */ + private int size; + + Key key_min;//the smallest key + Key key_max;//the maximum key + + public BTree() { + root = new Node(0); + } + + /** + * BTree size in node number + * To get space size, use toByte() and toKB() + */ + public int getSize() { + return size; + } + + public int getHeight() { + return height; + } + + /** + * Search for given key + * + * @param key + * Key to search + * @return Associated value; return null if no such key + */ + public Value get(Key key) { + if(size==0){ + return null; + } + if(key.compareTo(key_max)>0 || key.compareTo(key_min) <0){ + return null; + }else { + return search(root, key, height); + } + } + + /** + * @param treeHeight current subtree height + */ + private Value search(Node node, Key key, int treeHeight) { + Entry[] children = node.getChildrenArray(); + + // external node + if (treeHeight == 0) { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (equal(key, children[j].getKey())) { + return (Value) children[j].getValue(); + } + } + } + // internal node + else { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (j== node.getNumberOfChildren() - 1 || less(key, children[j + 1].getKey())) + return search(children[j].getNext(), key, treeHeight - 1); + } + } + return null; + } + + @SuppressWarnings("unchecked") + public void put(Key key) { + put(key, (Value) key); + } + + public String toString() { + return toString(root, height, "") + "\n"; + } + + private boolean less(Key k1, Key k2) { + return k1.compareTo(k2) < 0; + } + + private boolean equal(Key k1, Key 
k2) { + return k1.compareTo(k2) == 0; + } + + //================ Eric Jones - Begins ================================= + + public Value findReplace(Key key, Value val){ + return searchReplace(root, key, val, height); + } + + private Value searchReplace(Node node, Key key, Value val, int treeHeight) { + Entry[] children = node.getChildrenArray(); + + // external node + if (treeHeight == 0) { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (equal(key, children[j].getKey())) { + children[j].setValue(val); + return val; + } + } + } + + // internal node + else { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (j == node.getNumberOfChildren() - 1 || less(key, children[j+1].getKey())) + return searchReplace(children[j].getNext(), key, val, treeHeight - 1); + } + } + return null; + } + + //================ Eric Jones - Ends ================================= + + /** + * search for the minimal node + * fyu + */ + public Node getMinNode(){ + return searchMinNode(root, height); + } + private Node searchMinNode(Node node, int treeHeight) { + // external node + if (treeHeight == 0) { + return node; + } + // internal node + else { + return searchMinNode(node.getChildrenArray()[0].getNext(),treeHeight - 1); + } + } + + /** + * Inserts a Key-Value pair + * + * @param key + * Key to insert + * @param value + * Value to insert + */ + @SuppressWarnings("unchecked") + public void put(Key key, Value value) { + //automatically update key_max and key_min when inputing new keys + if(key_max==null || key_min==null){ + key_max=key_min=key;//initial status + }else if(key.compareTo(key_max)>0){ + key_max=key; + }else if(key.compareTo(key_min)<0){ + key_min=key; + } + + Node nodeFromSplit = insert(root, key, value, height); + size++; + if (nodeFromSplit == null) { + return; + } + + Node newRoot = new Node(2); + newRoot.getChildrenArray()[0] = new Entry(root.getChildrenArray()[0].getKey(), null, root); + newRoot.getChildrenArray()[1] = new Entry(nodeFromSplit.getChildrenArray()[0].getKey(), null, + nodeFromSplit); + root = newRoot; + height++; + } + + /** + * insert into subtree + * @param node current subtree root + * @param key + * @param value + * @param treeHeight current subtree height + * @return + */ + private Node insert(Node node, Key key, Value value, int treeHeight) { + int newEntryPosition; + Entry entryToInsert = new Entry(key, value, null); + // external node + if (treeHeight == 0) { + for (newEntryPosition = 0; newEntryPosition < node + .getNumberOfChildren(); newEntryPosition++) { + if (less(key, node.getChildrenArray()[newEntryPosition].getKey())) { + break; + } + } + } + // internal node + else { + for (newEntryPosition = 0; newEntryPosition < node.getNumberOfChildren(); newEntryPosition++) { + if ((newEntryPosition == node.getNumberOfChildren()-1) || less(key,node.getChildrenArray()[newEntryPosition + 1].getKey())){ + Node nodeFromSplit = + insert(node.getChildrenArray()[newEntryPosition++].getNext(),key, value, treeHeight - 1); + if (nodeFromSplit == null) { + return null; + } + entryToInsert.setKey(nodeFromSplit.getChildrenArray()[0].getKey()); + entryToInsert.setNext(nodeFromSplit); + break; + } + } + } + //set ONLY leaf node[M-1] to point to its next sibling node + Node nextTemp=null;//temp pointer + if(treeHeight==0 && node.getChildrenArray()[M-1]!=null){ + nextTemp=node.getChildrenArray()[M-1].getNext();//if current node is full and next is pointing to a sibling node + } + for (int i = node.getNumberOfChildren(); i > newEntryPosition; i--) { + //move entry one step 
backward + //if the current node is full and pointing to a sibling node, this will clean the next pointer + node.getChildrenArray()[i] = node.getChildrenArray()[i - 1]; + } + node.getChildrenArray()[newEntryPosition] = entryToInsert;//if the current node is full and pointing to a sibling node, this will clean the next pointer + node.setNumberOfChildren(node.getNumberOfChildren() + 1); + if(treeHeight==0 && node.getChildrenArray()[M-1]!=null && nextTemp!=null){ + node.getChildrenArray()[M-1].setNext(nextTemp);//restore the last pointer to the sibling node + } + if (node.getNumberOfChildren() < M) { + return null; + } else { + if(treeHeight!=0){ + return splitInternal(node); + } + else{ + return splitLeaf(node); + } + + } + } + + /** + * Splits node in half + * internal nodes set the next pointer to the sibling node + * + * @param oldNode + * The Node to Split + */ + private Node splitLeaf(Node oldNode) { + Node newNode = new Node(M / 2); + oldNode.setNumberOfChildren(M / 2); + for (int j = 0; j < M / 2; j++) { + newNode.getChildrenArray()[j] = oldNode.getChildrenArray()[M/ 2 + j]; + } + newNode.getChildrenArray()[M-1]=new Entry(null,null,oldNode.getChildrenArray()[M-1].getNext()); + + //clean unused space + for (int j = 0; j < M / 2-1; j++) { + oldNode.getChildrenArray()[M/ 2 + j]=null; + } + oldNode.getChildrenArray()[M-1]=new Entry(null,null,newNode); + return newNode; + } + + /** + * split root is different from internal + */ + private Node splitInternal(Node oldNode) { + Node newNode = new Node(M / 2); + oldNode.setNumberOfChildren(M / 2); + for (int j = 0; j < M / 2; j++) { + newNode.getChildrenArray()[j] = oldNode.getChildrenArray()[M + / 2 + j]; + } + return newNode; + } + + /** + * output all leaf entries + * @return + */ + public ArrayList> getLeafEntryList() { + ArrayList> entry_list=new ArrayList>(1024); + Node current_node=this.getMinNode(); + Entry[] children_array = current_node.getChildrenArray(); + while(true){ + for(int j=0; j currentNode, int ht, String indent) { + String outputString = ""; + if(currentNode==null || currentNode.getChildrenArray()==null){ + return ""; + } + Entry[] childrenArray = currentNode.getChildrenArray(); + + if (ht == 0) { + for (int j = 0; j < currentNode.getNumberOfChildren(); j++) { + outputString += indent + childrenArray[j].getKey() + " " + + childrenArray[j].getValue() + "\n"; + } + } else { + int num_children=currentNode.getNumberOfChildren(); + outputString += toString(childrenArray[0].getNext(), ht - 1, indent + " "); + outputString += indent + "[" + childrenArray[0].getKey() + "\n"; + if(num_children>=2){ + for (int j = 1; j < num_children - 1; j++) { + outputString += indent + "(" + childrenArray[j].getKey() + ")\n"; + outputString += toString(childrenArray[j].getNext(), ht - 1, indent + " "); + } + } + outputString += indent + "(" + childrenArray[num_children-1].getKey() + ")]\n"; + outputString += toString(childrenArray[num_children-1].getNext(), ht - 1, indent + " "); + } + return outputString; + } + + private int toByte(Node currentNode, int treeHeight){ + int space=0; + if(currentNode==null || currentNode.getChildrenArray()==null){ + return 0; + } + Entry[] childrenArray = currentNode.getChildrenArray(); + if(treeHeight==0){ + for (int j = 0; j < currentNode.getNumberOfChildren(); j++) { + space+=Entry.key_size+Entry.value_size;//space for key and value + } + if(currentNode.getChildrenArray()[M-1]!=null && currentNode.getChildrenArray()[M-1].getNext()!=null){ + space+=Entry.pointer_size;//pointer to sibling + } + }else{ + int 
num_children=currentNode.getNumberOfChildren(); + space+=Entry.pointer_size;//first pointer to left child + space+=toByte(childrenArray[0].getNext(),treeHeight-1); + if(num_children>=2){ + for (int j=1; j node, Key key, Value val, int treeHeight) { +Entry[] children = node.getChildrenArray(); + +// external node +if (treeHeight == 0) { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (equal(key, children[j].getKey())) { + children[j].setKey(null); + children[j].setValue(null); + + return val; //perhaps this should be return null, PERHAPS!!!!!! + } + } +} + +// internal node +else { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (node.getNumberOfChildren() == j + 1 + || less(key, children[j + 1].getKey())) + return searchRemove(children[j].getNext(), key, val, treeHeight - 1); + } +} +return null; +} + +public Value findReplace(Key key, Value val){ +return searchReplace(root, key, val, height); +} + +private Value searchReplace(Node node, Key key, Value val, int treeHeight) { +Entry[] children = node.getChildrenArray(); + +// external node +if (treeHeight == 0) { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (equal(key, children[j].getKey())) { + children[j].setValue(val); + return val; + } + } +} + +// internal node +else { + for (int j = 0; j < node.getNumberOfChildren(); j++) { + if (node.getNumberOfChildren() == j + 1 + || less(key, children[j + 1].getKey())) + return searchReplace(children[j].getNext(), key, val, treeHeight - 1); + } +} +return null; +} + +*/ \ No newline at end of file diff --git a/basic/btree/Entry.java b/basic/btree/Entry.java new file mode 100644 index 0000000..df6d68c --- /dev/null +++ b/basic/btree/Entry.java @@ -0,0 +1,60 @@ +package basic.btree; + +/** + * Entry in a node. Internal nodes only use key and next while External Nodes + * use key and value + * + * @author cgavidia + * + * @param + * Type of the Search Key + * @param + * Type of the Value Stored + */ +public class Entry, Value> { + + private Key key; + private Value value; + private Node next; // Helper field to iterate over array entries + //--for space calculation--beginning + public static final int key_size=4; + public static final int value_size=4; + public static final int pointer_size=4; + //--for space calculation--end + + public Entry(Key key, Value value, Node next) { + this.key = key; + this.value = value; + this.next = next; + } + + public Key getKey() { + return key; + } + + public void setKey(Key key) { + this.key = key; + } + + public Value getValue() { + return value; + } + + public void setValue(Value value) { + this.value = value; + } + + public Node getNext() { + return next; + } + + public void setNext(Node next) { + this.next = next; + } + + @Override + public String toString() { + return "(Key: " + key + " Value: " + value + ")"; + } + +} \ No newline at end of file diff --git a/basic/btree/Node.java b/basic/btree/Node.java new file mode 100644 index 0000000..07b3a00 --- /dev/null +++ b/basic/btree/Node.java @@ -0,0 +1,54 @@ +package basic.btree; +import java.util.ArrayList; + +/** + * B-Tree Node data type + * + * @author cgavidia + * + */ +@SuppressWarnings("unchecked") +public class Node, Value> { + + private int numberOfChildren; + private Entry[] childrenArray = new Entry[BTree.M]; +// private ArrayList> childrenArray=new ArrayList<>(BTree.M); + + /** + * Creates a node with k children + * + * @param k + * number of children + */ + public Node(int k) { + numberOfChildren = k; + } + + public Entry[] getChildrenArray() { + return 
childrenArray; + } + + public void setChildrenArray(Entry[] children) { + this.childrenArray = children; + } + + public int getNumberOfChildren() { + return numberOfChildren; + } + + public void setNumberOfChildren(int childrenNumber) { + this.numberOfChildren = childrenNumber; + } + + @Override + public String toString() { + String result = "{ "; + for (int i = 0; i < numberOfChildren; i++) { + Entry entry = childrenArray[i]; + result = result + entry.toString() + ", "; + } + result = result + " }"; + return result; + } + +} \ No newline at end of file diff --git a/basic/btree/OBTree.java b/basic/btree/OBTree.java new file mode 100644 index 0000000..bd579d5 --- /dev/null +++ b/basic/btree/OBTree.java @@ -0,0 +1,132 @@ +package basic.btree; + +import java.io.*; +import java.util.*; +import java.util.concurrent.*; + +import basic.util.DataRetriever; +import basic.storage_model.TBAT; + +/** + * OBTree use Long oid, and Long offset to insert into BTree + */ +@SuppressWarnings("unchecked") +public class OBTree extends BTree { + long total_inserts; +// public OBTree(){ +// super(); +// key_max=key_min=0L;//in OBTree initially both max and min keys (oid) are 0 +// } + + public long loadUpdateFile(String update_file_name) throws IOException{//if file read had timestamp use this + this.total_inserts=0; + long off=1; + String a; + long b; + long oid; + Scanner reads = new Scanner(new File(update_file_name)); + while (reads.hasNext()) { + a = reads.next(); // read OID + b = reads.nextLong(); // read VALUE + a = a.substring(0, a.length() - 1); // removing the comma that was auto-generated + oid = Long.parseLong(a); // placing that number into a variable + if (get(oid) != null) { + findReplace(oid, off); + } else { + put(oid, off); + total_inserts++; + }// end of if-else + off++; + } + reads.close(); + return total_inserts; + } + + /** + * load update the appendix of an updated file into a new BTree + */ + public OBTree loadAppendixIntoOBTree(String update_file_name) throws IOException{ + return new OBTree().loadAppendixIntoOBTree(update_file_name); + } + + /** + * + * @param update_file_name + * @param line_width + * @param start_line_num >=1 + * @param end_line_num >=1 + * @return + * @throws IOException + */ + public long loadAppendixRangeIntoOBTree(String update_file_name, int line_width, long start_line_num, long end_line_num) throws IOException{ + total_inserts=0; + long current_line_num=start_line_num; + BufferedReader input_file=new BufferedReader(new FileReader(update_file_name)); + input_file.skip((start_line_num-1)*line_width);//skip first start_line_num - 1 lines + String current_line, a, b; + long oid; + long off=current_line_num;//offset starts with current line num in the update file + while((current_line=input_file.readLine())!=null && current_line_num <= end_line_num){ + //only take the 1st part of "oid, val" after split and convert to long oid + oid=Long.parseLong(current_line.split(",")[0].trim()); + if(get(oid)!=null){//if this oid already exists in obtree + findReplace(oid,off);//replace with new offset + } else {//o.w. 
insert this new oid + put(oid, off); + total_inserts++; + } + off++; + current_line_num++; + } + input_file.close(); + return total_inserts; + } + + public long getTotal_inserts(){return total_inserts;} + + public long searchKey(long oid){ + Long offset=get(oid); + if(offset!=null){ + return offset; + }else{ + return DataRetriever.NOT_FOUND; + } + } + + /** + * obtree selection experiment using a selection file + * @param tbat_file_name + * @param select_file_name + * @param num_lines_body + * @param tbat_line_length + * @param search_value if true the searching for value by offset will be used + * @throws IOException + */ + public void searchSelectionFile(String tbat_file_name, String select_file_name, long num_lines_body, int tbat_line_length, boolean search_value) throws IOException{ + BufferedReader select_file=new BufferedReader(new FileReader(select_file_name)); + RandomAccessFile tbat_file=new RandomAccessFile(new File(tbat_file_name), "r"); + String str; + long target_oid; + long offset; + long value; + while((str=select_file.readLine())!=null && str.length()!=0){ + target_oid=Long.parseLong(str); + offset=searchKey(target_oid); + if(search_value) { + if (offset != DataRetriever.NOT_FOUND) { + value = TBAT.searchAppendixByOffSet(tbat_file, num_lines_body, tbat_line_length, offset, 2);//in a tbat, value is at 2 (3rd position in one line) + //out.println("***found in obtree: oid="+target_oid+" | value="+value); + } else { + value = TBAT.selectTBAT_body(tbat_file,num_lines_body,tbat_line_length,target_oid); + } + } + } + tbat_file.close(); + select_file.close(); + } + +} + + + + diff --git a/basic/btree/OBTreeInt.java b/basic/btree/OBTreeInt.java new file mode 100644 index 0000000..a77825b --- /dev/null +++ b/basic/btree/OBTreeInt.java @@ -0,0 +1,84 @@ +package basic.btree; + +import java.io.File; +import java.io.IOException; +import java.util.Scanner; + +@SuppressWarnings("unchecked") +public class OBTreeInt extends BTree { + public int total_inserts=0; + public int loadUpdateFile(String update_file_name) throws IOException{//if file read had timestamp use this + int OFF=1; + String a; + int b; + int valueOfA; + Scanner reads = new Scanner(new File(update_file_name)); + while (reads.hasNext()) { + a = reads.next(); // read OID + b = reads.nextInt(); // read VALUE + a = a.substring(0, a.length() - 1); // removing the comma that was auto-generated + valueOfA = Integer.parseInt(a); // placing that number into a variable + if (get(valueOfA) != null) { + findReplace(valueOfA, OFF); + } else { + put(valueOfA, OFF); + total_inserts++; + }// end of if-else + OFF++; + } + reads.close(); + return total_inserts; + } + + /** + * load update the appendix of an updated file into a new BTree + */ + public OBTreeInt loadAppendixIntoOBTree(String update_file_name) throws IOException{ + OBTreeInt appendixBTree = new OBTreeInt(); + int OFF=1; + String a; + int b; + int valueOfA; + Scanner reads = new Scanner(new File(update_file_name)); + while (reads.hasNext()) { + a = reads.next(); // read OID + b = reads.nextInt(); // read VALUE + a = a.substring(0, a.length() - 1); // removing the comma that was auto-generated + valueOfA = Integer.parseInt(a); // placing that number into a variable + if (appendixBTree.get(valueOfA) != null) { + appendixBTree.findReplace(valueOfA, OFF); + } else { + appendixBTree.put(valueOfA, OFF); + }// end of if-else + OFF++; + } + reads.close(); + return appendixBTree; + } + +} + +//public int bulkLoadUpdateFile2(String update_file_name) throws FileNotFoundException{ +//Scanner 
reads = new Scanner(new File(update_file_name));// reader for update file +//Integer offset = 1; // offset starts from 1 +//String line; // buffer for reach each line +//String[] line_vector; // tokenized line +//Integer oid; // each oid +//while (reads.hasNext()) { +// line = reads.nextLine(); +// line_vector = line.split(","); +// oid = Integer.parseInt(line_vector[1].trim()); +// if (findReplace((Key)oid, (Value) offset) != null) { +//// System.out.println("\nKey " + oid + " already exists. Update offset to " + offset + "."); +// } else { +//// System.out.println("A new key: " + oid + " \t\tInserting at offset: " + offset + "."); +// put((Key) oid, (Value) offset); +// } +// offset++; +//} +//reads.close(); +//this.total_inserts=offset.intValue()-1; +//return total_inserts; +//} + + diff --git a/basic/external_sorting/ExternalSort.java b/basic/external_sorting/ExternalSort.java new file mode 100644 index 0000000..19fce3b --- /dev/null +++ b/basic/external_sorting/ExternalSort.java @@ -0,0 +1,681 @@ +package basic.external_sorting; + + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.PriorityQueue; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; +import java.util.zip.Deflater; + +/** + * reference: http://grepcode.com/file/repo1.maven.org/maven2/com.google.code.externalsortinginjava/externalsortinginjava/0.1.8/com/google/code/externalsorting/ExternalSort.java + * +* Goal: offer a generic external-memory sorting program in Java. +* +* It must be : - hackable (easy to adapt) - scalable to large files - sensibly +* efficient. +* +* This software is in the public domain. +* +* Usage: java com/google/code/external_sorting/ExternalSort somefile.txt out.txt +* +* You can change the default maximal number of temporary files with the -t +* flag: java com/google/code/external_sorting/ExternalSort somefile.txt out.txt +* -t 3 +* +* For very large files, you might want to use an appropriate flag to allocate +* more memory to the Java VM: java -Xms2G +* com/google/code/external_sorting/ExternalSort somefile.txt out.txt +* +* By (in alphabetical order) Philippe Beaudoin, Eleftherios Chetzakis, Jon +* Elsas, Christan Grant, Daniel Haran, Daniel Lemire, Sugumaran Harikrishnan, +* Jerry Yang, First published: April 2010 originally posted at +* http://lemire.me/blog/archives/2010/04/01/external-memory-sorting-in-java/ +*/ +public class ExternalSort { + + /* + * This sorts a file (input) to an output file (output) using + * default parameters + * + * @param file + * source file + * + * @param file + * output file + * + */ + public static void sort(File input, File output) throws IOException { + ExternalSort.mergeSortedFiles(ExternalSort.sortInBatch(input),output); + } + + + + static int DEFAULTMAXTEMPFILES = 1024; + + // we divide the file into small blocks. If the blocks + // are too small, we shall create too many temporary files. + // If they are too big, we shall be using too much memory. 
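+	// Illustrative arithmetic only (assumed numbers): for a 2 GB input and the
+	// default of 1024 temporary files, sizeoffile is 4 GB, so the initial block
+	// size is about 4 MB; if the JVM then reports 512 MB of free heap, the block
+	// is grown to freemem / 2 = 256 MB, which keeps the temporary file count low.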
+ public static long estimateBestSizeOfBlocks(File filetobesorted, + int maxtmpfiles) { + long sizeoffile = filetobesorted.length() * 2; + /** + * We multiply by two because later on someone insisted on + * counting the memory usage as 2 bytes per character. By this + * model, loading a file with 1 character will use 2 bytes. + */ + // we don't want to open up much more than maxtmpfiles temporary + // files, better run + // out of memory first. + long blocksize = sizeoffile / maxtmpfiles + + (sizeoffile % maxtmpfiles == 0 ? 0 : 1); + + // on the other hand, we don't want to create many temporary + // files + // for naught. If blocksize is smaller than half the free + // memory, grow it. + long freemem = Runtime.getRuntime().freeMemory(); + if (blocksize < freemem / 2) { + blocksize = freemem / 2; + } + return blocksize; + } + + + /** + * This will simply load the file by blocks of lines, then sort them + * in-memory, and write the result to temporary files that have to be + * merged later. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @return a list of temporary flat files + */ + public static List sortInBatch(File file) + throws IOException { + return sortInBatch(file, defaultcomparator, DEFAULTMAXTEMPFILES, + Charset.defaultCharset(), null, false); + } + /** + * This will simply load the file by blocks of lines, then sort them + * in-memory, and write the result to temporary files that have to be + * merged later. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @return a list of temporary flat files + */ + public static List sortInBatch(File file, Comparator cmp) + throws IOException { + return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, + Charset.defaultCharset(), null, false); + } + + /** + * This will simply load the file by blocks of lines, then sort them + * in-memory, and write the result to temporary files that have to be + * merged later. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @param distinct + * Pass true if duplicate lines should be + * discarded. + * @return a list of temporary flat files + */ + public static List sortInBatch(File file, Comparator cmp, + boolean distinct) throws IOException { + return sortInBatch(file, cmp, DEFAULTMAXTEMPFILES, + Charset.defaultCharset(), null, distinct); + } + + /** + * This will simply load the file by blocks of lines, then sort them + * in-memory, and write the result to temporary files that have to be + * merged later. You can specify a bound on the number of temporary + * files that will be created. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @param maxtmpfiles + * maximal number of temporary files + * @param Charset + * character set to use (can use + * Charset.defaultCharset()) + * @param tmpdirectory + * location of the temporary files (set to null for + * default location) + * @param distinct + * Pass true if duplicate lines should be + * discarded. 
+ * @param numHeader + * number of lines to preclude before sorting starts + * @parame usegzip use gzip compression for the temporary files + * @return a list of temporary flat files + */ + public static List sortInBatch(File file, Comparator cmp, + int maxtmpfiles, Charset cs, File tmpdirectory, + boolean distinct, int numHeader, boolean usegzip) + throws IOException { + List files = new ArrayList(); + BufferedReader fbr = new BufferedReader(new InputStreamReader( + new FileInputStream(file), cs)); + long blocksize = estimateBestSizeOfBlocks(file, maxtmpfiles);// in + // bytes + + try { + List tmplist = new ArrayList(); + String line = ""; + try { + int counter = 0; + while (line != null) { + long currentblocksize = 0;// in bytes + while ((currentblocksize < blocksize) + && ((line = fbr.readLine()) != null)) { + // as long as you have enough memory + if (counter < numHeader) { + counter++; + continue; + } + tmplist.add(line); + // ram usage estimation, not + // very accurate, still more + // realistic that the simple 2 * + // String.length + currentblocksize += StringSizeEstimator + .estimatedSizeOf(line); + } + files.add(sortAndSave(tmplist, cmp, cs, + tmpdirectory, distinct, usegzip)); + tmplist.clear(); + } + } catch (EOFException oef) { + if (tmplist.size() > 0) { + files.add(sortAndSave(tmplist, cmp, cs, + tmpdirectory, distinct, usegzip)); + tmplist.clear(); + } + } + } finally { + fbr.close(); + } + return files; + } + + + /** + * This will simply load the file by blocks of lines, then sort them + * in-memory, and write the result to temporary files that have to be + * merged later. You can specify a bound on the number of temporary + * files that will be created. + * + * @param file + * some flat file + * @param cmp + * string comparator + * @param maxtmpfiles + * maximal number of temporary files + * @param Charset + * character set to use (can use + * Charset.defaultCharset()) + * @param tmpdirectory + * location of the temporary files (set to null for + * default location) + * @param distinct + * Pass true if duplicate lines should be + * discarded. + * @return a list of temporary flat files + */ + public static List sortInBatch(File file, Comparator cmp, + int maxtmpfiles, Charset cs, File tmpdirectory, boolean distinct) + throws IOException { + return sortInBatch(file, cmp, maxtmpfiles, cs, tmpdirectory, + distinct, 0, false); + } + + /** + * Sort a list and save it to a temporary file + * + * @return the file containing the sorted data + * @param tmplist + * data to be sorted + * @param cmp + * string comparator + * @param cs + * charset to use for output (can use + * Charset.defaultCharset()) + * @param tmpdirectory + * location of the temporary files (set to null for + * default location) + * @param distinct + * Pass true if duplicate lines should be + * discarded. 
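+	 * @param usegzip
+	 *                if true, the temporary file is written through a
+	 *                GZIPOutputStream (see the method body)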
+ */ + public static File sortAndSave(List tmplist, + Comparator cmp, Charset cs, File tmpdirectory, + boolean distinct, boolean usegzip) throws IOException { + Collections.sort(tmplist, cmp); + File newtmpfile = File.createTempFile("sortInBatch", + "flatfile", tmpdirectory); + newtmpfile.deleteOnExit(); + OutputStream out = new FileOutputStream(newtmpfile); + int ZIPBUFFERSIZE = 2048; + if (usegzip) + out = new GZIPOutputStream(out, ZIPBUFFERSIZE) { + { + def.setLevel(Deflater.BEST_SPEED); + } + }; + BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter( + out, cs)); + String lastLine = null; + try { + for (String r : tmplist) { + // Skip duplicate lines + if (!distinct || !r.equals(lastLine)) { + fbw.write(r); + fbw.newLine(); + lastLine = r; + } + } + } finally { + fbw.close(); + } + return newtmpfile; + } + + /** + * Sort a list and save it to a temporary file + * + * @return the file containing the sorted data + * @param tmplist + * data to be sorted + * @param cmp + * string comparator + * @param cs + * charset to use for output (can use + * Charset.defaultCharset()) + * @param tmpdirectory + * location of the temporary files (set to null for + * default location) + */ + public static File sortAndSave(List tmplist, + Comparator cmp, Charset cs, File tmpdirectory) + throws IOException { + return sortAndSave(tmplist, cmp, cs, tmpdirectory, false, false); + } + /** + * This merges a bunch of temporary flat files + * + * @param files + * @param output + * file + * @return The number of lines sorted. (P. Beaudoin) + */ + public static int mergeSortedFiles(List files, File outputfile) throws IOException { + return mergeSortedFiles(files, outputfile, defaultcomparator, + Charset.defaultCharset()); + } + /** + * This merges a bunch of temporary flat files + * + * @param files + * @param output + * file + * @return The number of lines sorted. (P. Beaudoin) + */ + public static int mergeSortedFiles(List files, File outputfile, + final Comparator cmp) throws IOException { + return mergeSortedFiles(files, outputfile, cmp, + Charset.defaultCharset()); + } + + /** + * This merges a bunch of temporary flat files + * + * @param files + * @param output + * file + * @return The number of lines sorted. (P. Beaudoin) + */ + public static int mergeSortedFiles(List files, File outputfile, + final Comparator cmp, boolean distinct) + throws IOException { + return mergeSortedFiles(files, outputfile, cmp, + Charset.defaultCharset(), distinct); + } + + /** + * This merges a bunch of temporary flat files + * + * @param files + * The {@link List} of sorted {@link File}s to be merged. + * @param Charset + * character set to use to load the strings + * @param distinct + * Pass true if duplicate lines should be + * discarded. (elchetz@gmail.com) + * @param outputfile + * The output {@link File} to merge the results to. + * @param cmp + * The {@link Comparator} to use to compare + * {@link String}s. + * @param cs + * The {@link Charset} to be used for the byte to + * character conversion. + * @param append + * Pass true if result should append to + * {@link File} instead of overwrite. Default to be false + * for overloading methods. + * @param usegzip + * assumes we used gzip compression for temporary files + * @return The number of lines sorted. (P. 
Beaudoin) + * @since v0.1.4 + */ + public static int mergeSortedFiles(List files, File outputfile, + final Comparator cmp, Charset cs, boolean distinct, + boolean append, boolean usegzip) throws IOException { + PriorityQueue pq = new PriorityQueue( + 11, new Comparator() { + @Override + public int compare(BinaryFileBuffer i, + BinaryFileBuffer j) { + return cmp.compare(i.peek(), j.peek()); + } + }); + ArrayList bfbs = new ArrayList(); + for (File f : files) { + final int BUFFERSIZE = 2048; + InputStream in = new FileInputStream(f); + BufferedReader br; + if (usegzip) { + br = new BufferedReader(new InputStreamReader( + new GZIPInputStream(in, BUFFERSIZE), cs)); + } else { + br = new BufferedReader(new InputStreamReader(in, + cs)); + } + + BinaryFileBuffer bfb = new BinaryFileBuffer(br); + bfbs.add(bfb); + } + BufferedWriter fbw = new BufferedWriter(new OutputStreamWriter( + new FileOutputStream(outputfile, append), cs)); + int rowcounter = merge(fbw,cmp,distinct, bfbs); + for (File f : files) f.delete(); + return rowcounter; + } + /** + * This merges several BinaryFileBuffer to an output writer. + * + * @param BufferedWriter + * A buffer where we write the data. + * @param cmp + * A comparator object that tells us how to sort the lines. + * @param distinct + * Pass true if duplicate lines should be + * discarded. (elchetz@gmail.com) + * @param buffers + * Where the data should be read. + * @return The number of lines sorted. (P. Beaudoin) + * + */ + public static int merge(BufferedWriter fbw, final Comparator cmp, boolean distinct, List buffers) throws IOException { + PriorityQueue pq = new PriorityQueue( + 11, new Comparator() { + @Override + public int compare(BinaryFileBuffer i, + BinaryFileBuffer j) { + return cmp.compare(i.peek(), j.peek()); + } + }); + for (BinaryFileBuffer bfb: buffers) + if(!bfb.empty()) + pq.add(bfb); + int rowcounter = 0; + String lastLine = null; + try { + while (pq.size() > 0) { + BinaryFileBuffer bfb = pq.poll(); + String r = bfb.pop(); + // Skip duplicate lines + if (!distinct || !r.equals(lastLine)) { + fbw.write(r); + fbw.newLine(); + lastLine = r; + } + ++rowcounter; + if (bfb.empty()) { + bfb.fbr.close(); + } else { + pq.add(bfb); // add it back + } + } + } finally { + fbw.close(); + for (BinaryFileBuffer bfb : pq) + bfb.close(); + } + return rowcounter; + + } + + /** + * This merges a bunch of temporary flat files + * + * @param files + * The {@link List} of sorted {@link File}s to be merged. + * @param Charset + * character set to use to load the strings + * @param distinct + * Pass true if duplicate lines should be + * discarded. (elchetz@gmail.com) + * @param outputfile + * The output {@link File} to merge the results to. + * @param cmp + * The {@link Comparator} to use to compare + * {@link String}s. + * @param cs + * The {@link Charset} to be used for the byte to + * character conversion. + * @return The number of lines sorted. (P. Beaudoin) + * @since v0.1.2 + */ + public static int mergeSortedFiles(List files, File outputfile, + final Comparator cmp, Charset cs, boolean distinct) + throws IOException { + return mergeSortedFiles(files, outputfile, cmp, cs, distinct, + false, false); + } + + /** + * This merges a bunch of temporary flat files + * + * @param files + * @param output + * file + * @param Charset + * character set to use to load the strings + * @return The number of lines sorted. (P. 
Beaudoin) + */ + public static int mergeSortedFiles(List files, File outputfile, + final Comparator cmp, Charset cs) throws IOException { + return mergeSortedFiles(files, outputfile, cmp, cs, false); + } + + public static void displayUsage() { + System.out + .println("java com.google.external_sorting.ExternalSort inputfile outputfile"); + System.out.println("Flags are:"); + System.out.println("-v or --verbose: verbose output"); + System.out.println("-d or --distinct: prune duplicate lines"); + System.out + .println("-t or --maxtmpfiles (followed by an integer): specify an upper bound on the number of temporary files"); + System.out + .println("-c or --charset (followed by a charset code): specify the character set to use (for sorting)"); + System.out + .println("-z or --gzip: use compression for the temporary files"); + System.out + .println("-H or --header (followed by an integer): ignore the first few lines"); + System.out + .println("-s or --store (following by a path): where to store the temporary files"); + System.out.println("-h or --help: display this message"); + } + + public static void main(String[] args) throws IOException { + boolean verbose = false; + boolean distinct = false; + int maxtmpfiles = DEFAULTMAXTEMPFILES; + Charset cs = Charset.defaultCharset(); + String inputfile = null, outputfile = null; + File tempFileStore = null; + boolean usegzip = false; + int headersize = 0; + for (int param = 0; param < args.length; ++param) { + if (args[param].equals("-v") + || args[param].equals("--verbose")) { + verbose = true; + } else if ((args[param].equals("-h") || args[param] + .equals("--help"))) { + displayUsage(); + return; + } else if ((args[param].equals("-d") || args[param] + .equals("--distinct"))) { + distinct = true; + } else if ((args[param].equals("-t") || args[param] + .equals("--maxtmpfiles")) + && args.length > param + 1) { + param++; + maxtmpfiles = Integer.parseInt(args[param]); + if (headersize < 0) { + System.err + .println("maxtmpfiles should be positive"); + } + } else if ((args[param].equals("-c") || args[param] + .equals("--charset")) + && args.length > param + 1) { + param++; + cs = Charset.forName(args[param]); + } else if ((args[param].equals("-z") || args[param] + .equals("--gzip"))) { + usegzip = true; + } else if ((args[param].equals("-H") || args[param] + .equals("--header")) && args.length > param + 1) { + param++; + headersize = Integer.parseInt(args[param]); + if (headersize < 0) { + System.err + .println("headersize should be positive"); + } + } else if ((args[param].equals("-s") || args[param] + .equals("--store")) && args.length > param + 1) { + param++; + tempFileStore = new File(args[param]); + } else { + if (inputfile == null) + inputfile = args[param]; + else if (outputfile == null) + outputfile = args[param]; + else + System.out.println("Unparsed: " + + args[param]); + } + } + if (outputfile == null) { + System.out + .println("please provide input and output file names"); + displayUsage(); + return; + } + Comparator comparator = defaultcomparator; + List l = sortInBatch(new File(inputfile), comparator, + maxtmpfiles, cs, tempFileStore, distinct, headersize, + usegzip); + if (verbose) + System.out + .println("created " + l.size() + " tmp files"); + mergeSortedFiles(l, new File(outputfile), comparator, cs, + distinct, false, usegzip); + } + + public static Comparator defaultcomparator = new Comparator() { + @Override + public int compare(String r1, String r2) { + return r1.compareTo(r2); + } + }; + +} + + +class BinaryFileBuffer { + public 
BufferedReader fbr; + private String cache; + private boolean empty; + + public BinaryFileBuffer(BufferedReader r) + throws IOException { + this.fbr = r; + reload(); + } + + public boolean empty() { + return this.empty; + } + + private void reload() throws IOException { + try { + if ((this.cache = this.fbr.readLine()) == null) { + this.empty = true; + this.cache = null; + } else { + this.empty = false; + } + } catch (EOFException oef) { + this.empty = true; + this.cache = null; + } + } + + public void close() throws IOException { + this.fbr.close(); + } + + public String peek() { + if (empty()) + return null; + return this.cache.toString(); + } + + public String pop() throws IOException { + String answer = peek(); + reload(); + return answer; + } + +} + diff --git a/basic/external_sorting/StringSizeEstimator.java b/basic/external_sorting/StringSizeEstimator.java new file mode 100644 index 0000000..83b6c15 --- /dev/null +++ b/basic/external_sorting/StringSizeEstimator.java @@ -0,0 +1,66 @@ +package basic.external_sorting; + + +/** + * reference: http://grepcode.com/file_/repo1.maven.org/maven2/com.google.code.externalsortinginjava/externalsortinginjava/0.1.8/com/google/code/externalsorting/StringSizeEstimator.java/?v=source + * + * @author Eleftherios Chetzakis + * + */ +public final class StringSizeEstimator { + + private static int OBJ_HEADER; + private static int ARR_HEADER; + private static int INT_FIELDS = 12; + private static int OBJ_REF; + private static int OBJ_OVERHEAD; + private static boolean IS_64_BIT_JVM; + + /** + * Private constructor to prevent instantiation. + */ + private StringSizeEstimator() { + } + + /** + * Class initializations. + */ + static { + // By default we assume 64 bit JVM + // (defensive approach since we will get + // larger estimations in case we are not sure) + IS_64_BIT_JVM = true; + // check the system property "sun.arch.data.model" + // not very safe, as it might not work for all JVM implementations + // nevertheless the worst thing that might happen is that the JVM is 32bit + // but we assume its 64bit, so we will be counting a few extra bytes per string object + // no harm done here since this is just an approximation. + String arch = System.getProperty("sun.arch.data.model"); + if (arch != null) { + if (arch.indexOf("32") != -1) { + // If exists and is 32 bit then we assume a 32bit JVM + IS_64_BIT_JVM = false; + } + } + // The sizes below are a bit rough as we don't take into account + // advanced JVM options such as compressed oops + // however if our calculation is not accurate it'll be a bit over + // so there is no danger of an out of memory error because of this. + OBJ_HEADER = IS_64_BIT_JVM ? 16 : 8; + ARR_HEADER = IS_64_BIT_JVM ? 24 : 12; + OBJ_REF = IS_64_BIT_JVM ? 8 : 4; + OBJ_OVERHEAD = OBJ_HEADER + INT_FIELDS + OBJ_REF + ARR_HEADER; + + } + + /** + * Estimates the size of a {@link String} object in bytes. + * + * @param s The string to estimate memory footprint. + * @return The estimated size in bytes. + */ + public static long estimatedSizeOf(String s) { + return (s.length() * 2) + OBJ_OVERHEAD; + } + +} diff --git a/basic/storage_model/BAT.java b/basic/storage_model/BAT.java new file mode 100644 index 0000000..6bed05a --- /dev/null +++ b/basic/storage_model/BAT.java @@ -0,0 +1,32 @@ +package basic.storage_model; +/** + * Created by fyu on 11/1/16. 
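+ *
+ * A BAT file holds fixed-width "oid,value" lines (see bat_format); selectBAT()
+ * binary-searches it by oid via DataRetriever.binarySearchValue(), which assumes
+ * the body is kept sorted by oid.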
+ */ +import basic.util.DataRetriever; + +import java.io.*; + +public class BAT { + public static final String bat_format = "%10d,%10d\n"; + + public static void searchSelectFile(String bat_file_name, String select_file_name, long num_lines_body, int bat_line_length) throws IOException{ + BufferedReader select_file=new BufferedReader(new FileReader(select_file_name)); + String str; + long target_oid; + long offset; + long value; + while((str=select_file.readLine())!=null && str.length()!=0) { + target_oid = Long.parseLong(str); + value=selectBAT(bat_file_name, num_lines_body, bat_line_length, target_oid); + } + select_file.close(); + } + + public static long selectBAT(String file_name,long num_lines, int line_length,long target_oid) throws IOException{ + int oid_position=0; + RandomAccessFile file=new RandomAccessFile(new File(file_name), "r"); + long value= DataRetriever.binarySearchValue(file, num_lines, line_length, oid_position, target_oid); + file.close(); + return value; + } +} diff --git a/basic/storage_model/BUN.java b/basic/storage_model/BUN.java new file mode 100644 index 0000000..2bb787f --- /dev/null +++ b/basic/storage_model/BUN.java @@ -0,0 +1,13 @@ +package basic.storage_model; + +public class BUN { + public int oid; + public T value; + public BUN(int oid, T value){ + this.oid=oid; + this.value=value; + } + public String toString(){ + return "("+oid+","+value+")"; + } +} diff --git a/basic/storage_model/BUNL.java b/basic/storage_model/BUNL.java new file mode 100644 index 0000000..aeacab1 --- /dev/null +++ b/basic/storage_model/BUNL.java @@ -0,0 +1,16 @@ +package basic.storage_model; + +/** + * use long for oid + */ +public class BUNL { + public long oid; + public T value; + public BUNL(long oid, T value){ + this.oid=oid; + this.value=value; + } + public String toString(){ + return "("+oid+","+value+")"; + } +} diff --git a/basic/storage_model/TBAT.java b/basic/storage_model/TBAT.java new file mode 100644 index 0000000..00fe5f4 --- /dev/null +++ b/basic/storage_model/TBAT.java @@ -0,0 +1,207 @@ +package basic.storage_model; + +import basic.util.DataRetriever; +import basic.btree.OBTree; + + +import java.io.*; +import java.util.ArrayList; +import static java.lang.System.out; + +/** + * Created by fyu on 11/1/16. + */ +public class TBAT { + public static final String tbat_format = "%s,%10d,%10d\n"; + + /** + * search in appendix by offset + * file is the updated tbat file + * offset must start from 1!!! 
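+     * The seek below relies on fixed-width lines: the appendix starts right
+     * after the num_lines_body body lines, so appendix row "offset" begins at
+     * byte (offset + num_lines_body - 1) * line_length.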
+ */ + public static long searchAppendixByOffSet(RandomAccessFile file, long num_lines_body, int line_length,long offset, int value_position) throws IOException { + long value= DataRetriever.NO_VALUE; + file.seek((offset + num_lines_body - 1) * line_length); + String line = file.readLine(); + if(line!=null) { + //out.println("searchAppendixByOffSet:"+line); + value = Long.parseLong(line.split(",")[value_position].trim()); + } + file.seek(0); + return value; + } + + + /** + * same method + * save file handler open time + */ + public static long selectTBAT_body(RandomAccessFile file, long num_lines_body, int line_length, long target_oid) throws IOException{ + long value; + int oid_position=1; + value= DataRetriever.binarySearchValue(file, num_lines_body, line_length, oid_position, target_oid); + return value; + } + + + /** + * fyu + * search only the body of a TBAT using binary search, regardless of the appendix + * used for searching in combination with btree (which stores data in the appendix) + */ + public static long selectTBAT_body(String file_name, long num_lines_body, int line_length, long target_oid) throws IOException{ + long value; + int oid_position=1; + RandomAccessFile file=new RandomAccessFile(new File(file_name), "r"); + value= DataRetriever.binarySearchValue(file, num_lines_body, line_length, oid_position, target_oid); + file.close(); + return value; + } + + /** + * use select file + */ + public static void selectTBAT_body(String tbat_file_name, String select_file_name, long num_lines_body, int line_length) throws IOException{ + BufferedReader select_file=new BufferedReader(new FileReader(select_file_name)); + String str; + long target_oid; + long offset; + long value; + while((str=select_file.readLine())!=null && str.length()!=0) { + target_oid = Long.parseLong(str); + value=selectTBAT_body(tbat_file_name,num_lines_body,line_length,target_oid); + } + select_file.close(); + } + + public static void selectTBAT_Uncleaned(String tbat_file_name, String select_file_name, long num_lines_body, int line_length) throws IOException{ + BufferedReader select_file=new BufferedReader(new FileReader(select_file_name)); + String str; + long target_oid; + long offset; + long value; + while((str=select_file.readLine())!=null && str.length()!=0) { + target_oid = Long.parseLong(str); + value=selectTBAT_Uncleaned(tbat_file_name, num_lines_body, line_length, target_oid); + } + select_file.close(); + } + + + public static long selectTBAT_Uncleaned(String tbat_file_name, long num_lines_body, int line_length, long target_oid) throws IOException{ + long value=0; + int oid_position=1; + BufferedReader append_reader=new BufferedReader(new FileReader(tbat_file_name)); + value=searchAppendedFile(append_reader, num_lines_body, line_length, oid_position, target_oid); + append_reader.close(); + if(value== DataRetriever.NOT_FOUND){ + RandomAccessFile file=new RandomAccessFile(new File(tbat_file_name), "r"); + value= DataRetriever.binarySearchValue(file, num_lines_body, line_length, oid_position, target_oid); + file.close(); + } + return value; + } + + public static long selectTBAT_Uncleaned2(String file_name, int num_lines_body, int line_length, int target_oid) throws IOException{ + long value=0; + int oid_position=1; + RandomAccessFile file=new RandomAccessFile(new File(file_name), "r"); + value= DataRetriever.binarySearchValue(file, num_lines_body, line_length, oid_position, target_oid); + file.close(); + + return value; + } + + + /** + * @param oid_position the position of the oid (for tbat =1, for bat=0) 
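+     * Note: the loop below scans the whole appendix without breaking, so when
+     * an oid was updated more than once, the value of the last (most recent)
+     * matching line is returned.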
+ */ + public static long searchAppendedFile(BufferedReader append_reader, long num_lines_body, int line_length, + int oid_position, long target_oid) throws IOException{ + + append_reader.skip((num_lines_body)*line_length); + // skip the body of the updated tbat file, only read the appended part at the end + String current_line; + int temp_oid; + long temp_value; + long value= DataRetriever.NOT_FOUND; + + while((current_line=append_reader.readLine())!=null){ + temp_oid=Integer.parseInt(current_line.split(",")[oid_position].trim()); + if(temp_oid==target_oid){ + value=Integer.parseInt(current_line.split(",")[oid_position+1].trim()); + } + } + return value; + } + + /** + * + */ + public static void searchWithOBTree(OBTree obtree, String tbat_file_name, String select_file_name, long num_lines_body, int tbat_line_length) throws IOException{ + BufferedReader select_file=new BufferedReader(new FileReader(select_file_name)); + RandomAccessFile tbat_file=new RandomAccessFile(new File(tbat_file_name), "r");//open + String str; + long offset; + long target_oid; + long value; + while((str=select_file.readLine())!=null && str.length()!=0) { + target_oid = Long.parseLong(str); + offset=obtree.searchKey(target_oid); + if(offset!=DataRetriever.NOT_FOUND){ + value= TBAT.searchAppendixByOffSet(tbat_file, num_lines_body, tbat_line_length, offset, 2);//in a tbat, value is at 2 (3rd position in one line) + }else{ + value= TBAT.selectTBAT_body(tbat_file_name, num_lines_body, tbat_line_length, target_oid); + } + } + tbat_file.close(); + select_file.close(); + } + + + /*method for Eric Jones Thesis--begin*/ + /** + * select the value of the target oid + * given a tbat file and a list of split appendix files + */ + public static long selectTBAT_Uncleaned_Split(String tbat_file_name, + ArrayList appendix_file_names, + int num_lines_body, int line_length, int target_oid) + throws IOException{ + long value= DataRetriever.NOT_FOUND; + int oid_position=1; + if(!appendix_file_names.isEmpty()){ + for(String appendix_file_name:appendix_file_names){ + BufferedReader append_reader=new BufferedReader(new FileReader(appendix_file_name)); + //no line needs to be skipped in split appendix files + value=searchAppendedFile(append_reader, 0, line_length, oid_position, target_oid); + append_reader.close(); + if(value!= DataRetriever.NOT_FOUND) return value; + } + } + + RandomAccessFile file=new RandomAccessFile(new File(tbat_file_name), "r"); + value= DataRetriever.binarySearchValue(file, num_lines_body, line_length, oid_position, target_oid); + file.close(); + + return value; + } + + public static long selectTBAT_Uncleaned_Split2(ArrayList appendix_file_names, + int num_lines_body, int line_length, int target_oid) throws IOException{ + long value = DataRetriever.NOT_FOUND; + //int oid_position=1; + if(!appendix_file_names.isEmpty()){ + for(String appendix_file_name:appendix_file_names){ + RandomAccessFile appendix_file = new RandomAccessFile(appendix_file_name, "r"); + value = searchAppendixByOffSet(appendix_file, 0, line_length, + target_oid, 1); + if(value!= DataRetriever.NOT_FOUND) return value; + + } + } + return value; + } + /*Method by Eric Jones for Thesis--end*/ + +} diff --git a/basic/storage_model/TBUN.java b/basic/storage_model/TBUN.java new file mode 100644 index 0000000..fb1f1df --- /dev/null +++ b/basic/storage_model/TBUN.java @@ -0,0 +1,30 @@ +package basic.storage_model; + + + +public class TBUN extends BUN implements Comparable { + public long timestamp; + public static final String tbat_format = "%s,%10d,%10d"; 
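+    // A TBUN prints as "timestamp,oid,value" (tbat_format above): toString()
+    // keeps only the last 8 digits of the timestamp, and compareTo() orders by
+    // oid first, then by timestamp.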
+ public TBUN(long timestamp, int oid, T value) { + super(oid, value); + this.timestamp=timestamp; + } + + public String toString(){ + String timestampstr=String.format("%d", timestamp); + if(timestampstr.length()>=8){ + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + } + return String.format(tbat_format, timestampstr, oid, value); + } + + public int compareTo(TBUN tbun2){ + int diff_oid=oid-tbun2.oid; + if(diff_oid!=0){ + return diff_oid; + }else{ + return (int)(timestamp-tbun2.timestamp); + } + } + +} diff --git a/basic/storage_model/TBUNL.java b/basic/storage_model/TBUNL.java new file mode 100644 index 0000000..7e316b8 --- /dev/null +++ b/basic/storage_model/TBUNL.java @@ -0,0 +1,35 @@ +package basic.storage_model; + +/** + * same as TBUN but use Long for oid + */ + +public class TBUNLextends BUNL implements Comparable { + public long timestamp; + public static final String tbat_format = "%s,%10d,%10d"; + public TBUNL(long timestamp, long oid, T value) { + super(oid, value); + this.timestamp=timestamp; + } + + public String toString(){ + String timestampstr=String.format("%d", timestamp); + if(timestampstr.length()>=8){ + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + } + return String.format(tbat_format, timestampstr, oid, value); + } + + /** + * it's only for sorting, no need to get the actual comparing difference + */ + public int compareTo(TBUNL tbun2){ + long diff_oid=oid-tbun2.oid; + if(diff_oid!=0){ + return (int)diff_oid; + }else{ + return (int)(timestamp-tbun2.timestamp); + } + } + +} diff --git a/basic/util/BasicTools.java b/basic/util/BasicTools.java new file mode 100644 index 0000000..286d09b --- /dev/null +++ b/basic/util/BasicTools.java @@ -0,0 +1,80 @@ +package basic.util; +import java.io.*; +import java.lang.instrument.Instrumentation; + +public class BasicTools { + public static void copyFile(String file_in_name, String file_out_name) throws IOException{ + BufferedReader br = new BufferedReader(new FileReader(file_in_name)); + BufferedWriter bw = new BufferedWriter(new FileWriter(file_out_name)); + int i; + do { + i = br.read(); + if (i != -1) { + if (Character.isLowerCase((char) i)) + bw.write(Character.toUpperCase((char) i)); + else if (Character.isUpperCase((char) i)) + bw.write(Character.toLowerCase((char) i)); + else + bw.write((char) i); + } + } while (i != -1); + br.close(); + bw.close(); + } + + private static Instrumentation instrumentation; + + public static void premain(String args, Instrumentation inst) { + instrumentation = inst; + } + + public static long getObjectSize(Object o) { + return instrumentation.getObjectSize(o); + } + + + /** + * reference:https://www.cs.cmu.edu/~adamchik/15-121/lectures/Sorting%20Algorithms/code/MergeSort.java + */ + public static void mergeSort(Comparable [ ] a) + { + Comparable[] tmp = new Comparable[a.length]; + mergeSort(a, tmp, 0, a.length - 1); + } + + private static void mergeSort(Comparable [ ] a, Comparable [ ] tmp, int left, int right) + { + if( left < right ) + { + int center = (left + right) / 2; + mergeSort(a, tmp, left, center); + mergeSort(a, tmp, center + 1, right); + merge(a, tmp, left, center + 1, right); + } + } + + private static void merge(Comparable[ ] a, Comparable[ ] tmp, int left, int right, int rightEnd ) + { + int leftEnd = right - 1; + int k = left; + int num = rightEnd - left + 1; + + while(left <= leftEnd && right <= rightEnd) + if(a[left].compareTo(a[right]) <= 0) + tmp[k++] = a[left++]; + else + tmp[k++] = a[right++]; + 
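+        // One half is now exhausted; the two loops below copy whatever remains
+        // of the other half, and the final loop copies tmp back into a.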
+ while(left <= leftEnd) // Copy rest of first half + tmp[k++] = a[left++]; + + while(right <= rightEnd) // Copy rest of right half + tmp[k++] = a[right++]; + + // Copy tmp back + for(int i = 0; i < num; i++, rightEnd--) + a[rightEnd] = tmp[rightEnd]; + } + + +} diff --git a/basic/util/DataCreator.java b/basic/util/DataCreator.java new file mode 100644 index 0000000..f394ac8 --- /dev/null +++ b/basic/util/DataCreator.java @@ -0,0 +1,399 @@ +package basic.util; + +import basic.storage_model.BAT; +import basic.storage_model.TBAT; + +import java.io.*; +import java.util.*; + + + +public class DataCreator { + + public static void prepareData(long num_lines, String bat_file_name, + String tbat_file_name) throws IOException{ + + PrintWriter bat_file= new PrintWriter(new FileWriter(bat_file_name)); + PrintWriter tbat_file = new PrintWriter(new FileWriter(tbat_file_name)); + + String bat_str=""; + String tbat_str=""; + String timestampstr=""; + + timestampstr=String.format("%d", System.currentTimeMillis()); + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + + for(long i=0;i update_list=makeUpdateList(per, num_lines); + for(Integer current_line : update_list){ + update_file.format(BAT.bat_format, (int)current_line, -1); + } + update_file.close(); + } + + /** + * update value is same as line number + */ + public static void prepareUpdateList3(double per, long num_lines, + String update_file_name) throws IOException { + + PrintWriter update_file=new PrintWriter(new FileWriter(update_file_name)); + List update_list=makeUpdateList(per, num_lines); + for(Long current_line : update_list){ + update_file.format(BAT.bat_format, (long)current_line, (long)current_line); + } + update_file.close(); + } + + /** + * version 4 creates TBAT.tbat_formate update list + * allow duplicated update values + * the update value increases from 1 + */ + public static void prepareUpdateList4(double per, int num_lines, + String update_file_name) throws IOException { + PrintWriter update_file=new PrintWriter(new FileWriter(update_file_name)); + int update_num_lines=(int)(per*num_lines); + int total_updated=0; + int current_line=1; + int update_value=1; + String timestampstr; + Random rand=new Random(); + while(total_updated < update_num_lines){ + timestampstr=String.format("%d", System.currentTimeMillis()); + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + current_line=rand.nextInt(num_lines)+1; + update_file.format(TBAT.tbat_format, timestampstr, current_line, update_value++); + total_updated++; + } + update_file.close(); + } + + /** + * version 4.1 + * num_lines use long + */ + public static void prepareUpdateList41(double per, long num_lines, + String update_file_name) throws IOException { + PrintWriter update_file=new PrintWriter(new FileWriter(update_file_name)); + long update_num_lines=(long)(per*num_lines); + long total_updated=0; + long current_line=1; + long update_value=1; +// String timestampstr; + Random rand=new Random(); + while(total_updated < update_num_lines){ +// timestampstr=String.format("%d", System.currentTimeMillis()); +// timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + current_line=(long)(rand.nextDouble()*num_lines)+1L; + update_file.format(BAT.bat_format, current_line, update_value++); + total_updated++; + } + update_file.close(); + } + + /** + * doesn't need to shuffle a list randomly + * we just need to make a file + * for oid =1 to num_lines + * each time there is a random double 
probability p generated + * if p<=per + * then write this oid with the update value++ + */ + public static void prepareUpdateList5(double per, long num_lines, + String update_file_name) throws IOException { + PrintWriter update_file=new PrintWriter(new FileWriter(update_file_name)); + long update_num_lines=(long)(per*num_lines); + long total_updated=0; + long current_line=1; + long update_value=1; +// String timestampstr; + Random rand=new Random(); + double p;//random probability + while(total_updated < update_num_lines){ + p=rand.nextDouble(); + if(p<=per){ + update_file.format(BAT.bat_format, current_line, update_value++); + total_updated++; + } + if(current_line++>=num_lines) + current_line=1; + } + update_file.close(); + } + + static List makeList(int begin, int end){ + List list=new ArrayList(end-begin+1); + for (int i=begin;i<=end;i++){ + list.add(i); + } + return list; + } + + static List makeListLong(long begin, long end){ + List list=new ArrayList(); + for (long i=begin;i<=end;i++){ + list.add(i); + } + return list; + } + + /** + * the update list doesn't need to be sorted 2014-10-02 + */ + public static List makeUpdateList(double per, int num_lines){ + List list=makeList(1,num_lines); + Collections.shuffle(list); + int update_num_lines=(int)(per*num_lines); + List update_list_sorted=list.subList(0, update_num_lines); +// Collections.sort(update_list_sorted); + return update_list_sorted; + } + + /** + * the update list doesn't need to be sorted 2014-10-02 + */ + public static List makeUpdateList(double per, long num_lines){ + List list=makeListLong(1,num_lines); + Collections.shuffle(list); + long update_num_lines=(long)(per*num_lines); + List update_list_sorted=list.subList(0, (int)update_num_lines); +// Collections.sort(update_list_sorted); + return update_list_sorted; + } + + /** + * works only for a small selection file + */ + public static void prepareSelectionFile(String output_file_name, double sel_per, long num_lines) throws IOException{ + PrintWriter output_file=new PrintWriter(new BufferedWriter(new FileWriter(output_file_name))); + List list=DataCreator.makeUpdateList(sel_per, num_lines); + for(long oid:list){ + output_file.println(oid+""); + } + output_file.close(); + } + + /** + * produce selection of large size + * same as prepareUpdateList5 use a probability menor + */ + public static void prepareSelectionFile5(String output_file_name, double sel_per, long num_lines) throws IOException{ + PrintWriter output_file=new PrintWriter(new BufferedWriter(new FileWriter(output_file_name))); +// List list=DataCreator.makeUpdateList(sel_per, num_lines); +// for(long oid:list){ +// output_file.println(oid+""); +// } + long sel_num_lines=(long)(sel_per*num_lines);//total selection number + long total_selected=0; + long current_line=1; + Random rand=new Random(); + double p;//random probability + while(total_selected < sel_num_lines){ + p=rand.nextDouble(); + if(p<=sel_per){ + output_file.format("%d\n",current_line); + total_selected++; + } + if(current_line++>=num_lines) + current_line=1; + } + output_file.close(); + } + + public static List loadSelectionFile(String input_file_name) throws IOException{ + BufferedReader file_in=new BufferedReader(new FileReader(input_file_name)); + List list=new ArrayList(); + String line=""; + while((line=file_in.readLine())!=null){ + list.add(Integer.parseInt(line.trim())); + } + file_in.close(); + return list; + } + + + /** + * + * @param update_file_name + * @param appendix_file_prefix + * @param appendix_block_size + * @return number of 
appendix files returned + * @throws IOException + */ + public static int creaetTBATAppendix(String update_file_name, String appendix_file_prefix, int appendix_block_size) throws IOException{ + if(appendix_block_size==0){ + throw new IOException("appendix_block_size is zero!"); + } + BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name)); + ArrayList update_lines=new ArrayList();//buffer of update file + + //read update file to buffer + String line=""; + ArrayList split_buffer=new ArrayList();//buffer of to split the update file buffer + int appendix_file_index=1; + int split_buffer_count=0; + String timestampstr=""; + long current_time_mills=System.currentTimeMillis(); + + while((line = update_file_in.readLine()) != null){ + split_buffer.add(line); + if(++split_buffer_count % appendix_block_size == 0){ + timestampstr=String.format("%d", current_time_mills++); + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + saveStringBufferToFile(appendix_file_prefix+"_"+(appendix_file_index++)+".txt",split_buffer,timestampstr); + split_buffer.clear(); + } + } + + //dump the rest of update file + if(!split_buffer.isEmpty()){ + timestampstr=String.format("%d", current_time_mills++); + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + saveStringBufferToFile(appendix_file_prefix+"_"+(appendix_file_index++)+".txt",split_buffer,timestampstr); + } + update_file_in.close(); + return appendix_file_index--; + } + + public static void saveStringBufferToFile(String output_file_name, ArrayList buffer, String timestampstr) throws IOException{ + PrintWriter output_file=new PrintWriter(new BufferedWriter(new FileWriter(output_file_name))); + for(String line:buffer){ + output_file.println(timestampstr+","+line); + } + output_file.close(); + } + + +// /** +// * devide the appendix files into a given number of split files +// */ +// public static void creaetTBATAppendix2(String update_file_name, String appendix_file_prefix, +// int appendix_num) throws IOException{ +// if(appendix_num==0){ +// throw new IOException("appendix_num is zero!"); +// } +// +// BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name)); +// ArrayList update_lines=new ArrayList();//buffer of update file +// +// +// //read update file to buffer +// String line=""; +// ArrayList split_buffer=new ArrayList();//buffer of to split the update file buffer +// int appendix_file_index=1; +// int split_buffer_count=0; +// String timestampstr=""; +// long current_time_mills=System.currentTimeMillis(); +// +// while((line = update_file_in.readLine()) != null){ +// split_buffer.add(line); +// if(++split_buffer_count % appendix_block_size == 0){ +// timestampstr=String.format("%d", current_time_mills++); +// timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); +// saveStringBufferToFile(appendix_file_prefix+"_"+(appendix_file_index++)+".txt",split_buffer,timestampstr); +// split_buffer.clear(); +// } +// } +// +// //dump the rest of update file +// if(!split_buffer.isEmpty()){ +// timestampstr=String.format("%d", current_time_mills++); +// timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); +// saveStringBufferToFile(appendix_file_prefix+"_"+(appendix_file_index++)+".txt",split_buffer,timestampstr); +// } +// update_file_in.close(); +// } + + +} + diff --git a/basic/util/DataRetriever.java b/basic/util/DataRetriever.java new file mode 100644 index 0000000..8f32d49 --- 
/dev/null +++ b/basic/util/DataRetriever.java @@ -0,0 +1,115 @@ +package basic.util; + +import javax.xml.crypto.Data; +import java.io.*; +import java.util.ArrayList; +import static java.lang.System.out; + +public class DataRetriever { +// public static int DEFAULT_BUFFER_SIZE=8192; + + /** + * + * select the value of the target oid + * given a tbat with appendix appended at the end of the tbat file + * + * @param file_name + * @param num_lines_body + * @param line_length + * @param oid_position the position of the oid (for tbat =1, for bat=1) + * @param target_oid + * @return target_value + * @throws IOException + */ + + //public static final int NOT_FOUND=Integer.MIN_VALUE; + //public static final long NOT_FOUND=Long.MIN_VALUE; + public static final long NOT_FOUND=-1; + public static final long NO_VALUE=-9999; + + + /** + * binary search for oid in the body of tbat (not the appended part) + * @param oid_position the position of the oid (for tbat =1, for bat=1) + */ + public static long binarySearchValue(RandomAccessFile file, long num_lines_body, int line_length, + int oid_position, long target_oid) throws IOException{ + long low=0; + long high=num_lines_body-1; + long mid, oid_mid; + String bat_current_line; + + while(low<=high){ + mid=(low+high)/2; + file.seek(mid*line_length); + bat_current_line=file.readLine(); + oid_mid=Long.parseLong(bat_current_line.split(",")[oid_position].trim()); + if(oid_mid == target_oid){ + //file.seek(0);//reset file pointer after updating + //System.out.println("found at: "+oid_mid); + return oid_mid; + }else if(oid_mid < target_oid) low=mid+1; + else high=mid-1; + } + System.out.println("Not found"); + return Integer.MIN_VALUE; + } + + + /** + * searchKey the length of one line in a file + */ + public static int getLineLength(String file_name) throws IOException { + RandomAccessFile randomReader=new RandomAccessFile(new File(file_name),"r"); + String first_line=randomReader.readLine(); + randomReader.close(); + int line_length=first_line.length()+1;//include '\n' + return line_length; + } + + /** + * searchKey the total line numbers in a file + * reference: http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java + */ + public static int getFileLineNumber(String file_name) throws IOException { + InputStream is = new BufferedInputStream(new FileInputStream(file_name)); + try { + byte[] c = new byte[1024]; + int count = 0; + int readChars = 0; + boolean endsWithoutNewLine = false; + while ((readChars = is.read(c)) != -1) { + for (int i = 0; i < readChars; ++i) { + if (c[i] == '\n') + ++count; + } + endsWithoutNewLine = (c[readChars - 1] != '\n'); + } + if (endsWithoutNewLine) { + ++count; + } + return count; + } finally { + is.close(); + } + } + +} + + + +// /** +// * binary search TBAT body +// * this method don't need line_length +// */ +// public static long selectTBAT_body(String file_name,int num_lines_body, int target_oid) throws IOException{ +// long value=0; +// int oid_position=1; +// RandomAccessFile file=new RandomAccessFile(new File(file_name), "r"); +// String first_line=file.readLine(); +// file.seek(0); +// int line_length=first_line.length()+1;//include '\n' +// value=binarySearchValue(file, num_lines_body, line_length, oid_position, target_oid); +// file.close(); +// return value; +// } diff --git a/basic/util/DataUpdator.java b/basic/util/DataUpdator.java new file mode 100644 index 0000000..9d084f2 --- /dev/null +++ b/basic/util/DataUpdator.java @@ -0,0 +1,591 @@ +package basic.util; +import java.io.*; +import 
java.util.ArrayList; +import java.util.Collections; + +import basic.storage_model.*; +import basic.btree.Entry; +import basic.btree.OBTree; +public class DataUpdator { + + public static int DEFAULT_BUFFER_SIZE=8192; + + public static void updateTBAT(String tbat_file_name, + String update_file_name) throws IOException{ + updateTBAT(tbat_file_name, update_file_name, DEFAULT_BUFFER_SIZE); + } + + /** + * append update file to the end of the TBAT + */ + public static void updateTBAT(String tbat_file_name, + String update_file_name, int buffer_size) throws IOException{ + PrintWriter tbat_file_out = new PrintWriter(new FileWriter(tbat_file_name,true)); + BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name), buffer_size); + String line=""; + String timestampstr=""; + timestampstr=String.format("%d", System.currentTimeMillis()); + timestampstr=timestampstr.substring(timestampstr.length()-8,timestampstr.length()); + while((line = update_file_in.readLine()) != null){ + tbat_file_out.println(timestampstr+","+line); + } + update_file_in.close(); + tbat_file_out.close(); + } + + public static void updateBAT1(String bat_file_name, + String update_file_name) throws IOException{ + RandomAccessFile bat_file = new RandomAccessFile(new File(bat_file_name), "rw"); + BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name)); + int update_oid; + int bat_oid; + long current_pos=0; + String current_line=""; + String update_line=""; + //read in update file + while((update_line = update_file_in.readLine()) != null){ + String[] tokens=update_line.split(","); + update_oid=Integer.parseInt(tokens[0].trim()); + //update bat file according to update_oid + while((current_line=bat_file.readLine())!=null){ + String[] tokens_bat=current_line.split(","); + bat_oid=Integer.parseInt(tokens_bat[0].trim()); + if(bat_oid == update_oid){ + current_pos=bat_file.getFilePointer(); + bat_file.seek(current_pos-current_line.length()-1); + bat_file.writeBytes(update_line+"\n"); + bat_file.seek(0);//back to top of bat file + break; + } + } + } + update_file_in.close(); + bat_file.close(); + } + + /** + * faster than updateBAT1, v2 uses buffered reader to read bat_file, and use randomaccessfile only when writing + * after one line is updated, the buffered reader will seek(0) + * this version works in all cases, including the update list is not sorted according to oid + */ + public static void updateBAT2(String bat_file_name, + String update_file_name) throws IOException{ + updateBAT2(bat_file_name, update_file_name, DEFAULT_BUFFER_SIZE); + } + + /** + * default BufferedReader size is 8192 + * this version can change the buffered reader size + */ + public static void updateBAT2(String bat_file_name, + String update_file_name, int buffer_size) throws IOException{ + RandomAccessFile bat_file_writer = new RandomAccessFile(new File(bat_file_name), "rw"); + FileInputStream bat_file_in=new FileInputStream(bat_file_name); + BufferedReader bat_file_reader=new BufferedReader(new InputStreamReader(bat_file_in), buffer_size); + BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name), buffer_size); + + int update_oid; + int bat_oid; + String current_line=""; + String update_line=""; + //read in update file + while((update_line = update_file_in.readLine()) != null){ + long current_line_num=1; + String[] tokens=update_line.split(","); + update_oid=Integer.parseInt(tokens[0].trim()); +// System.out.println("update oid:"+update_oid); + + //update bat file according to 
update_oid + current_line = bat_file_reader.readLine();//read the 1st line of bat file + while(current_line != null){ + String[] tokens_bat=current_line.split(","); + bat_oid=Integer.parseInt(tokens_bat[0].trim()); + + if(bat_oid == update_oid){ + bat_file_writer.seek((current_line_num-1)*(current_line.length()+1)); + bat_file_writer.writeBytes(update_line+"\n"); + + //reset buffered reader to the beginning of bat file + bat_file_in.getChannel().position(0); + bat_file_reader=new BufferedReader(new InputStreamReader(bat_file_in)); + current_line_num=1; + break; + } + current_line_num++; + current_line=bat_file_reader.readLine(); + } + } + update_file_in.close(); + bat_file_in.close(); + bat_file_reader.close(); + bat_file_writer.close(); + } + + /** + * faster than updateBAT2, no need to seek(0) in bat_file when one line is updated. + * this version only works when the update list file is sorted. + */ + public static void updateBAT3(String bat_file_name, + String update_file_name) throws IOException{ + RandomAccessFile bat_file_writer = new RandomAccessFile(new File(bat_file_name), "rw"); + FileInputStream bat_file_in=new FileInputStream(bat_file_name); + BufferedReader bat_file_reader=new BufferedReader(new InputStreamReader(bat_file_in)); + BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name)); + int update_oid; + int bat_oid; + long current_pos=0; + long current_line_num=1; + String current_line=""; + String update_line=""; + //read in update file + while((update_line = update_file_in.readLine()) != null){ + String[] tokens=update_line.split(","); + update_oid=Integer.parseInt(tokens[0].trim()); + + //update bat file according to update_oid + current_line = bat_file_reader.readLine(); + while(current_line != null){ + String[] tokens_bat=current_line.split(","); + bat_oid=Integer.parseInt(tokens_bat[0].trim()); + + if(bat_oid == update_oid){ + bat_file_writer.seek((current_line_num-1)*(current_line.length()+1)); + bat_file_writer.writeBytes(update_line+"\n"); + current_line_num++; + + break; + } + current_line_num++; + current_line=bat_file_reader.readLine(); + } + } + update_file_in.close(); + bat_file_in.close(); + bat_file_reader.close(); + bat_file_writer.close(); + } + + /** + * update bat using binary search + * Assumption: the oids in the BAT file are sorted + */ + public static void updateBAT_BinarySearch(String bat_file_name, String update_file_name) throws IOException{ + RandomAccessFile bat_file = new RandomAccessFile(new File(bat_file_name), "rw"); + BufferedReader update_file_in =new BufferedReader(new FileReader(update_file_name)); + int line_length=bat_file.readLine().length()+1; + bat_file.seek(0); + String update_line=null; + while((update_line = update_file_in.readLine()) != null){ + binarySearchUpdateBAT(bat_file,line_length,update_line); + } + update_file_in.close(); + } + + /** + * binary search for one update_line according to oid + * @param bat_file + * @param update_line + * @throws IOException + */ + public static int binarySearchUpdateBAT(RandomAccessFile bat_file, int line_length, String update_line) throws IOException{ + int update_oid=Integer.parseInt(update_line.split(",")[0].trim()); +// System.out.println("update oid:"+update_oid); + + int low=0; + int high=(int)bat_file.length()/line_length-1; + int mid, bat_oid_mid; + String bat_current_line; + + while(low<=high){ + mid=(low+high)/2; + bat_file.seek(mid*line_length); + bat_current_line=bat_file.readLine(); +// System.out.println(bat_current_line); + 
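+            // Because every BAT line is padded to the same byte length, the file can
+            // be binary searched on disk like an array: seek(mid*line_length) lands
+            // exactly at the start of line mid. One update then costs O(log n) seeks,
+            // e.g. roughly 20 reads for a 1,000,000-line file (an illustrative figure,
+            // not a measurement from these experiments) instead of a full scan.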
bat_oid_mid=Integer.parseInt(bat_current_line.split(",")[0].trim()); + if(bat_oid_mid == update_oid){ + //update this line + bat_file.seek(mid*line_length); + bat_file.writeBytes(update_line+"\n"); + bat_file.seek(0);//reset file pointer after updating + return mid+1;//return this line number + }else if(bat_oid_mid < update_oid) low=mid+1; + else high=mid-1; + } + return -1; + } + + /** + * binary search tbat file according to target tbun + */ + public static int binarySearchUpdateTBAT(RandomAccessFile tbat_file, int line_length, TBUN tbun_target) throws IOException{ + int low=0; + int high=(int)tbat_file.length()/line_length-1; + int mid, tbat_oid_mid; + String tbat_current_line; + + while(low<=high){ + mid=(low+high)/2; + tbat_file.seek(mid*line_length); + tbat_current_line=tbat_file.readLine(); +// System.out.println(mid+":"+tbat_current_line); + tbat_oid_mid=Integer.parseInt(tbat_current_line.split(",")[1].trim());//tbat oid index is 1! + if(tbat_oid_mid == tbun_target.oid){ + //update this line + tbat_file.seek(mid*line_length); + tbat_file.writeBytes(tbun_target+"\n"); + tbat_file.seek(0);//reset file pointer after updating + return mid;//return this line number + }else if(tbat_oid_mid < tbun_target.oid) low=mid+1; + else high=mid-1; + } + System.out.println("Not found "+tbun_target+" !"); + return -1; + } + + /** + * binary search tbat file according to target tbun + * high is the end of the tbat file (in our exp tbat=body, appendix=update list file). + * @param low the starting row number to begin binary search, low starts at 0 + * @return line number where the tbun_target is found + */ + public static int binarySearchUpdateTBAT(RandomAccessFile tbat_file, int line_length, TBUN tbun_target, int low) throws IOException{ + int high=(int)tbat_file.length()/line_length; + int mid, tbat_oid_mid; + String tbat_current_line; + +// System.out.println("low: "+low); + + int count=0;//search round + while(low<=high){ + count++; + mid=(low+high)/2; + tbat_file.seek(mid*line_length); + tbat_current_line=tbat_file.readLine(); +// System.out.println(mid+":"+tbat_current_line); + tbat_oid_mid=Integer.parseInt(tbat_current_line.split(",")[1].trim());//tbat oid index is 1! + if(tbat_oid_mid == tbun_target.oid){ + //update this line + tbat_file.seek(mid*line_length); + tbat_file.writeBytes(tbun_target+"\n"); + tbat_file.seek(0);//reset file pointer after updating +// System.out.println("searching round: "+count); + return mid;//return this line number + }else if(tbat_oid_mid < tbun_target.oid) low=mid+1; + else high=mid-1; + } + System.out.println("Not found "+tbun_target+" !"); + return -1; + } + + /** + * binary search tbat file according to target tbun + * high is the end of the tbat file (in our exp tbat=body, appendix=update list file). + * @param low the starting row number to begin binary search, low starts at 0 + * @return line number where the tbun_target is found + */ + public static int binarySearchUpdateTBAT(RandomAccessFile tbat_file, int line_length, TBUN tbun_target, int low, DiskAccessCount countTotal) throws IOException{ + int high=(int)tbat_file.length()/line_length; + int mid, tbat_oid_mid; + String tbat_current_line; + +// System.out.println("low: "+low); + + int count=0;//search round + while(low<=high){ + count++; + mid=(low+high)/2; + tbat_file.seek(mid*line_length); + tbat_current_line=tbat_file.readLine(); +// System.out.println(mid+":"+tbat_current_line); + tbat_oid_mid=Integer.parseInt(tbat_current_line.split(",")[1].trim());//tbat oid index is 1! 
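+            // A TBAT line is laid out as timestamp,oid,value, so the oid is field 1
+            // here (field 0 in a plain BAT). On a hit the whole line is overwritten
+            // in place with tbun_target.toString(); this stays aligned assuming the
+            // padded %10d-style format keeps the rewritten line the same length as
+            // the one it replaces.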
+ if(tbat_oid_mid == tbun_target.oid){ + //update this line + tbat_file.seek(mid*line_length); + tbat_file.writeBytes(tbun_target+"\n"); + tbat_file.seek(0);//reset file pointer after updating + countTotal.disk_access_total+=count; + return mid;//return this line number + }else if(tbat_oid_mid < tbun_target.oid) low=mid+1; + else high=mid-1; + } + System.out.println("Not found "+tbun_target+" !"); + countTotal.disk_access_total+=count; + return 0; + } + + /** + * based on binarySearchUpdateTBAT + * use long numbers + */ + public static long binarySearchUpdateTBAT_Long(RandomAccessFile tbat_file, int line_length, TBUNL tbun_target, long low, DiskAccessCount countTotal) throws IOException{ + long high=(int)tbat_file.length()/line_length; + long mid, tbat_oid_mid; + String tbat_current_line; + +// System.out.println("low: "+low); + + long count=0;//search round + while(low<=high){ + count++; + mid=(low+high)/2; + tbat_file.seek(mid*line_length); + tbat_current_line=tbat_file.readLine(); +// System.out.println(mid+":"+tbat_current_line); + tbat_oid_mid=Long.parseLong(tbat_current_line.split(",")[1].trim());//tbat oid index is 1! + if(tbat_oid_mid == tbun_target.oid){ + //update this line + tbat_file.seek(mid*line_length); + tbat_file.writeBytes(tbun_target+"\n"); + tbat_file.seek(0);//reset file pointer after updating + countTotal.disk_access_total+=count; + return mid;//return this line number + }else if(tbat_oid_mid < tbun_target.oid) low=mid+1; + else high=mid-1; + } + System.out.println("Not found "+tbun_target+" !"); + countTotal.disk_access_total+=count; + return 0; + } + + /** + * @param oid_position the place of oid in the line for appendix =1, for normal update file =0 + * + */ + public static void sortMergeFileToTBAT(String tbat_file_name, String appendix_file_name, int oid_position) throws IOException{ + BufferedReader appendix_file_in =new BufferedReader(new FileReader(appendix_file_name)); + String line=""; + ArrayList buffer=new ArrayList(1000); + + System.out.println("load buffer"); + if(oid_position==1){//for appendix file + while((line=appendix_file_in.readLine())!=null){ + String[] tbun_fields=line.split(","); + long timestamp=Long.parseLong(tbun_fields[0].trim()); + int oid=Integer.parseInt(tbun_fields[1].trim()); + int value=Integer.parseInt(tbun_fields[2].trim()); + buffer.add(new TBUN(timestamp,oid,value)); + } + }else{//for normal update file + long timestamp=System.currentTimeMillis(); + while((line=appendix_file_in.readLine())!=null){ + String[] tbun_fields=line.split(","); + int oid=Integer.parseInt(tbun_fields[0].trim()); + int value=Integer.parseInt(tbun_fields[1].trim()); + buffer.add(new TBUN(timestamp,oid,value)); + } + } + + System.out.println("buffer size:"+buffer.size()); + + appendix_file_in.close(); + + System.out.println("sorting buffer"); + Collections.sort(buffer);//a modified merge sort + + System.out.println("binarySearchUpdateTBAT"); + RandomAccessFile tbat_file = new RandomAccessFile(new File(tbat_file_name), "rw"); + int line_length=tbat_file.readLine().length()+1; + tbat_file.seek(0); + for(TBUN tbun:buffer){ + DataUpdator.binarySearchUpdateTBAT(tbat_file,line_length,tbun); + } + + } + + /** + * In this version2, we load the lines of file into the memory first and then parse into TBUN ArrayList + * @param oid_position the place of oid in the line for appendix =1, for normal update file =0 + * + */ + public static void sortMergeFileToTBAT2(String tbat_file_name, String appendix_file_name, int oid_position) throws IOException{ + BufferedReader 
appendix_file_in =new BufferedReader(new FileReader(appendix_file_name)); + + ArrayList buffer=new ArrayList(1000); + ArrayList lines=new ArrayList(1000); + + String line_temp=""; + while((line_temp=appendix_file_in.readLine())!=null){ + lines.add(line_temp); + } + + if(oid_position==1){//for appendix file + for(String line:lines){ + String[] tbun_fields=line.split(","); + long timestamp=Long.parseLong(tbun_fields[0].trim()); + int oid=Integer.parseInt(tbun_fields[1].trim()); + int value=Integer.parseInt(tbun_fields[2].trim()); + buffer.add(new TBUN(timestamp,oid,value)); + } + }else{//for normal update file + long timestamp=System.currentTimeMillis(); + for(String line:lines){ + String[] tbun_fields=line.split(","); + int oid=Integer.parseInt(tbun_fields[0].trim()); + int value=Integer.parseInt(tbun_fields[1].trim()); + buffer.add(new TBUN(timestamp,oid,value)); + } + } + + appendix_file_in.close(); + Collections.sort(buffer);//a modified merge sort + RandomAccessFile tbat_file = new RandomAccessFile(new File(tbat_file_name), "rw"); + int line_length=tbat_file.readLine().length()+1; + tbat_file.seek(0); + for(TBUN tbun:buffer){ + DataUpdator.binarySearchUpdateTBAT(tbat_file,line_length,tbun); + } + + } + + /** + * version 3 is same as version 2, except return disk read count + * @param oid_position the place of oid in the line for appendix =1, for normal update file =0 + * + */ + public static long sortMergeFileToTBAT3(String tbat_file_name, String appendix_file_name, int oid_position) throws IOException{ + DiskAccessCount countTotal=new DiskAccessCount(); + BufferedReader appendix_file_in =new BufferedReader(new FileReader(appendix_file_name)); + ArrayList buffer=new ArrayList(1000); + ArrayList lines=new ArrayList(1000); + + String line_temp=""; + while((line_temp=appendix_file_in.readLine())!=null){ + lines.add(line_temp); + countTotal.disk_access_total++; + } + + if(oid_position==1){//for appendix file + for(String line:lines){ + String[] tbun_fields=line.split(","); + long timestamp=Long.parseLong(tbun_fields[0].trim()); + int oid=Integer.parseInt(tbun_fields[1].trim()); + int value=Integer.parseInt(tbun_fields[2].trim()); + buffer.add(new TBUN(timestamp,oid,value)); + } + }else{//for normal update file + long timestamp=System.currentTimeMillis(); + for(String line:lines){ + String[] tbun_fields=line.split(","); + int oid=Integer.parseInt(tbun_fields[0].trim()); + int value=Integer.parseInt(tbun_fields[1].trim()); + buffer.add(new TBUN(timestamp,oid,value)); + } + } + + appendix_file_in.close(); + Collections.sort(buffer);//a modified merge sort + RandomAccessFile tbat_file = new RandomAccessFile(new File(tbat_file_name), "rw"); + int line_length=tbat_file.readLine().length()+1; + tbat_file.seek(0); + for(TBUN tbun:buffer){ + DataUpdator.binarySearchUpdateTBAT(tbat_file,line_length,tbun, 0, countTotal); + } + return countTotal.disk_access_total; + } + + /** + * version 4 is based on version 3 + * version 4 aims to reduce the temporary memory needed + * + */ + public static long sortMergeFileToTBAT4(String tbat_file_name, String appendix_file_name, int oid_position) throws IOException{ + DiskAccessCount countTotal=new DiskAccessCount(); + BufferedReader appendix_file_in =new BufferedReader(new FileReader(appendix_file_name)); + ArrayList buffer=new ArrayList(1000); + long current_time_stamp=System.currentTimeMillis(); + String line=""; + while((line=appendix_file_in.readLine())!=null){ + String[] tbun_fields=line.split(","); + int 
oid=Integer.parseInt(tbun_fields[oid_position].trim()); + int value=Integer.parseInt(tbun_fields[oid_position+1].trim()); + long timestamp; + if(oid_position==1){//for apendix file + timestamp=Long.parseLong(tbun_fields[0].trim()); + }else{//for normal update file + timestamp=current_time_stamp; + } + buffer.add(new TBUN(timestamp,oid,value)); + countTotal.disk_access_total++; + } + appendix_file_in.close(); + Collections.sort(buffer);//a modified merge sort + RandomAccessFile tbat_file = new RandomAccessFile(new File(tbat_file_name), "rw"); + int line_length=tbat_file.readLine().length()+1; + tbat_file.seek(0); + for(TBUN tbun:buffer){ + DataUpdator.binarySearchUpdateTBAT(tbat_file,line_length,tbun, 0, countTotal); + } + tbat_file.close(); + return countTotal.disk_access_total; + } + + + /** + * use long numbers + * oid_position is small, int works + */ + public static long sortMergeFileToTBAT41(String tbat_file_name, String appendix_file_name, int oid_position, int buffer_size) throws IOException{ + DiskAccessCount countTotal=new DiskAccessCount(); + BufferedReader appendix_file_in =new BufferedReader(new FileReader(appendix_file_name)); + ArrayList buffer=new ArrayList(buffer_size); + long current_time_stamp=System.currentTimeMillis(); + String line=""; + while((line=appendix_file_in.readLine())!=null){ + String[] tbun_fields=line.split(","); + long oid=Long.parseLong(tbun_fields[oid_position].trim()); + long value=Long.parseLong(tbun_fields[oid_position+1].trim()); + long timestamp; + if(oid_position==1){//for apendix file + timestamp=Long.parseLong(tbun_fields[0].trim()); + }else{//for normal update file + timestamp=current_time_stamp; + } + buffer.add(new TBUNL(timestamp,oid,value)); + countTotal.disk_access_total++; + } + appendix_file_in.close(); + Collections.sort(buffer);//a modified merge sort + RandomAccessFile tbat_file = new RandomAccessFile(new File(tbat_file_name), "rw"); + int line_length=tbat_file.readLine().length()+1; + tbat_file.seek(0); + for(TBUNL tbun:buffer){ + DataUpdator.binarySearchUpdateTBAT_Long(tbat_file,line_length,tbun, 0, countTotal); + } + tbat_file.close(); + return countTotal.disk_access_total; + } + + /** + * same as sortMergeFileToTBAT41 but use default buffer size (10, as in java doc) + * https://docs.oracle.com/javase/8/docs/api/java/util/ArrayList.html + */ + public static long sortMergeFileToTBAT41(String tbat_file_name, String appendix_file_name, int oid_position) throws IOException{ + return sortMergeFileToTBAT41(tbat_file_name, appendix_file_name, oid_position, 10); + } + + /** + * merge appendix to body using OBTree + */ + public static long mergeAppendixToTBAT_OBTree (OBTree obtree, RandomAccessFile reader, + RandomAccessFile writer, int line_length) throws Exception{ + ArrayList> entry_list=obtree.getLeafEntryList(); + DiskAccessCount countTotal=new DiskAccessCount(); + int low=0;//low searching position when doing binary search + for (Entryentry: entry_list ){ + long line_num_update=entry.getValue(); + reader.seek((line_num_update-1)*line_length); + String line_updating=reader.readLine(); + countTotal.disk_access_total++;//one reading + String[] tokens=line_updating.split(","); + //!!! 
update file must be a tbat format file + Long timestamp=Long.parseLong(tokens[0].trim()); + Integer tbat_oid=Integer.parseInt(tokens[1].trim()); + Integer value=Integer.parseInt(tokens[2].trim()); + TBUN tbun_updating=new TBUN(timestamp.longValue(),tbat_oid.intValue(),value); + low=DataUpdator.binarySearchUpdateTBAT(writer, line_length, tbun_updating, low, countTotal); + } + return countTotal.disk_access_total; + } + + +} diff --git a/basic/util/DiskAccessCount.java b/basic/util/DiskAccessCount.java new file mode 100644 index 0000000..d3dea82 --- /dev/null +++ b/basic/util/DiskAccessCount.java @@ -0,0 +1,5 @@ +package basic.util; + +public class DiskAccessCount { + public long disk_access_total=0; +} diff --git a/basic/util/FileSplitter.java b/basic/util/FileSplitter.java new file mode 100644 index 0000000..52d7192 --- /dev/null +++ b/basic/util/FileSplitter.java @@ -0,0 +1,47 @@ +package basic.util; + +import java.io.*; + +public class FileSplitter { + + // total splitting partitions + public static int total_partition_num=3; + + public FileSplitter() { + + } + + public FileSplitter(int total_partition_num) { + this.total_partition_num=total_partition_num; + } + + public void setTotalPartitionNum(int total_partition_num) { + this.total_partition_num=total_partition_num; + } + + public static void splitByPartitionNum(String input_file_name, int total_partition_num) throws IOException{ + long file_line_num = DataRetriever.getFileLineNumber(input_file_name); + long chunk_line_num = (file_line_num -1)/ total_partition_num +1 ;//line number in each chunk. rounded up! +// System.out.println("chunk_line_num="+chunk_line_num); + BufferedReader input_file=new BufferedReader(new FileReader(input_file_name)); + long current_chunk_line_num=0;//line number in current chunk + int current_chunk_num=1; + PrintWriter output_file = new PrintWriter(new BufferedWriter(new FileWriter(input_file_name+"_"+current_chunk_num))); + String current_line; + while((current_line=input_file.readLine())!=null){ + if(current_chunk_line_num < chunk_line_num){ + output_file.println(current_line); + current_chunk_line_num++; + }else { + output_file.close(); + current_chunk_num++; + output_file = new PrintWriter(new BufferedWriter(new FileWriter(input_file_name+"_"+current_chunk_num))); + output_file.println(current_line); + current_chunk_line_num=1; + } + } + output_file.close(); + input_file.close(); + } + +} diff --git a/basic/util/MathTool.java b/basic/util/MathTool.java new file mode 100644 index 0000000..709ebfd --- /dev/null +++ b/basic/util/MathTool.java @@ -0,0 +1,76 @@ +package basic.util; + +import java.util.*; + +public class MathTool { + public static double mean(ArrayList list) { + double sum = 0; + for (double val : list) { + sum += val; + } + return sum / list.size(); + } + + + public static double median(ArrayList list) { + if(list.size()==0){ + System.out.println("list is empty for:"+list); + return Double.NEGATIVE_INFINITY; + } + if(list.size()==1){ + return list.get(0); + } + int middle=list.size()/2; + if (list.size()%2 == 1) { + return list.get(middle); + } else { + return (list.get(middle-1)+list.get(middle))/2.0; + } + } + + /** + * Returns the sample variance in the ArrayList a, NaN if no such value. 
+ */ + public static double var(ArrayList a) { + if (a.size() == 0) return Double.NaN; + double avg = mean(a); + double sum = 0.0; + for (int i = 0; i < a.size(); i++) { + sum += (a.get(i) - avg) * (a.get(i) - avg); + } + return sum / (a.size() - 1); + } + + /** + * Returns the sample standard deviation in the ArrayList a, NaN if no such value. + */ + public static double stddev(ArrayList a) { + return Math.sqrt(var(a)); + } + + /** + * remove outlier + */ + + public static ArrayList removeOutlier(ArrayList a, double m){ + double u=mean(a); + double s=stddev(a); + ArrayList filtered=new ArrayList(); + for(Double e:a){ + if(e > u-m*s && e < u+m*s){ + filtered.add(e); + } + } + return filtered; + } + + private static final long MEGABYTE = 1024L * 1024L; + + public static double bytesToKB(long bytes) { + return bytes*1.0 / 1024L; + } + + public static double bytesToMB(long bytes) { + return bytes*1.0 / MEGABYTE; + } +} diff --git a/basic/util/Merger.java b/basic/util/Merger.java new file mode 100644 index 0000000..7a932f9 --- /dev/null +++ b/basic/util/Merger.java @@ -0,0 +1,20 @@ +package basic.util; + + +/** + * This is the sort merge class for data cleaning after AOC updates + * @author fyu + * + */ + +import java.io.*; +import java.util.*; + +public class Merger { + + public static void mergeWithBody(String body_file_name, String appendix, String output_file_name) throws IOException { + // read in an appendix + + + } +} diff --git a/build.xml b/build.xml new file mode 100644 index 0000000..76f5321 --- /dev/null +++ b/build.xml @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/exp_merge_ob/exp_merge_ob.java b/exp_merge_ob/exp_merge_ob.java new file mode 100644 index 0000000..ff89ac8 --- /dev/null +++ b/exp_merge_ob/exp_merge_ob.java @@ -0,0 +1,232 @@ +package exp_merge_ob; + +import static java.lang.System.out; + +import java.io.*; +import java.text.SimpleDateFormat; +import java.util.*; + +import basic.btree.*; +import basic.util.*; + +public class exp_merge_ob { + + static int num_lines_1m=47660;//number of lines of 1MB BAT file + static int num_lines;//number of lines in the tbat and bat files + static int max_exp_times;//maximum iteration times of experiment + static ArrayList pers=new ArrayList();//update percentages + static int appendix_num_split=10; //number of split files for appendixes + static double sel_per=0.1;//selection percentage + static String result_dir= "data/exp_merge_ob/"; + static String data_dir="data/"; + static String tbat_file_name_original=data_dir+"tbat.txt"; + static String tbat_file_name_copy1=tbat_file_name_original.substring(0, tbat_file_name_original.length()-4)+"_cp1.txt"; + static String tbat_file_name_copy2=tbat_file_name_original.substring(0, tbat_file_name_original.length()-4)+"_cp2.txt"; + + public static void main(String[] args) throws Exception{ + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); +// final String result_file_name=result_dir+"result-merge-ob-"+ +// (new SimpleDateFormat("yyyyMMdd-HHmmss").format(Calendar.getInstance().getTime())); + final String result_file_name=result_dir+"result-merge-ob.txt"; + PrintWriter result_file = null; + if(args.length < 3){ + out.println("Please input num_lines " + + " max_exp_times per1 per2 per3 ... 
"); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + max_exp_times = Integer.parseInt(args[1]); + for(int i=2;i> all_times_merge_ob=new HashMap>(); + HashMap> all_times_merge_bi=new HashMap>(); + HashMap mean_times_merge_ob=new HashMap(); + HashMap mean_times_merge_bi=new HashMap(); + HashMap all_memories_ob=new HashMap();//ob-tree memory used + HashMap disk_access_ob=new HashMap();//ob-tree disk access + HashMap disk_access_bi=new HashMap();//binary search merge disk access + + //---do the experiment--- + for(double per:pers){ + + + out.println("exp: update "+per+"%"); + result_file.println("* exp: update "+per+"%"); + String update_file_name=data_dir+"update_"+num_lines+"_"+per+".txt"; + + ArrayList merge_bi_time_temp=new ArrayList(); + ArrayList merge_ob_time_temp=new ArrayList(); + + // bulk loading of update list file into OB-tree + // OBTree obtree = new OBTree(); + // OBTree changed to BTree + OBTree obtree = new OBTree(); + obtree.loadUpdateFile(update_file_name); + all_memories_ob.put(per,obtree.toKB()); + long disk_access_ob_temp=0; + long disk_access_bi_temp=0; + for(int i=0;i merge_bi_time_temp=all_times_merge_bi.searchKey(per); +// ArrayList merge_ob_time_temp=all_times_merge_ob.searchKey(per); +// for(int i=0;i merge_bi_time_temp=all_times_merge_bi.get(per); + for(int i=0;i merge_ob_time_temp=all_times_merge_ob.get(per); + for(int i=0;i pers=new ArrayList();//update percentages + static int appendix_num_split=10; //number of split files for appendixes + static double sel_per=0.1;//selection percentage + static String result_dir= "data/exp_merge_ob/"; + static String data_dir="data/"; + static String tbat_file_name_original=data_dir+"tbat.txt"; + static String tbat_file_name_copy1=tbat_file_name_original.substring(0, tbat_file_name_original.length()-4)+"_cp1.txt"; + static String tbat_file_name_copy2=tbat_file_name_original.substring(0, tbat_file_name_original.length()-4)+"_cp2.txt"; + + public static void main(String[] args) throws Exception{ + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + final String result_file_name=result_dir+"result-ob-memory.txt"; + PrintWriter result_file = null; + if(args.length < 3){ + out.println("OB-Tree Loading Memory Test: please input num_lines " + + " per1 per2 per3 ... 
"); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + for(int i=1;i> all_times_merge_ob=new HashMap>(); + HashMap> all_times_merge_bi=new HashMap>(); + HashMap mean_times_merge_ob=new HashMap(); + HashMap mean_times_merge_bi=new HashMap(); + HashMap all_memories_ob=new HashMap();//ob-tree memory used + + //---do the experiment--- + for(double per:pers){ + out.println("exp: update "+per+"%"); + result_file.println("* exp: update "+per+"%"); + String update_file_name=data_dir+"update_"+num_lines+"_"+per+".txt"; + + ArrayList merge_bi_time_temp=new ArrayList(); + ArrayList merge_ob_time_temp=new ArrayList(); + + // bulk loading of update list file into OB-tree + //OBTree changed to BTree + OBTree obtree = new OBTree(); + obtree.loadUpdateFile(update_file_name); + all_memories_ob.put(per,obtree.toKB()); + } + out.println("Major expriment finished!"); + out.println(); + result_file.println("\n#Analysis:\n"); + + //---------memory - OB-merge ------- + result_file.println("OB-Merge memory used:\n"); + result_file.format("%3s, %10s \n","perc","KB"); + for (double per : pers) { + result_file.format("%-3.2f, %10.3f \n", per, all_memories_ob.get(per)); + } + + //end of file + result_file.println(); + String program_end_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + result_file.println("Program Started at: "+program_start_date_time); + result_file.println("Program Ended at: "+program_end_date_time); + long end_global=System.currentTimeMillis(); + double elapsedTime=(end_global-start_global)/1000.0; + result_file.println("Elapsed Time:"+elapsedTime+"s\n"); + + result_file.close(); + out.println("Elapsed Time:"+elapsedTime+"s"); + } + + /** + * prepare files for merge progressive select experiment + * files include: + * - tbat and bat files + * - updat list file + * - appendix files for each update file (AOC appendix files) + * - selection oid file + * + */ + public static void prepareFiles() throws IOException { + //-----prepare tbat and bat files----- + DataCreator.prepareTBAT(num_lines, tbat_file_name_original); + out.println("TBAT file "+tbat_file_name_original+" created"); + //-----prepare update and appendix files p%=1%-5%----- + for(double per:pers){ + //create update files + String update_file_name=data_dir+"update_"+num_lines+"_"+per+".txt"; + DataCreator.prepareUpdateList1(per, num_lines, update_file_name, 1); + out.println("Update file: "+update_file_name+" created"); + } + + + //-----prepare selection query files----- +// DataCreator.prepareSelectionFile(select_file_name, sel_per, num_lines); +// System.out.println("Selection files created"); + } + + +} diff --git a/exp_merge_progressive/exp_merge_progressive_btree.java b/exp_merge_progressive/exp_merge_progressive_btree.java new file mode 100644 index 0000000..caba5d8 --- /dev/null +++ b/exp_merge_progressive/exp_merge_progressive_btree.java @@ -0,0 +1,282 @@ +package exp_merge_progressive; + +import java.io.*; +import java.text.SimpleDateFormat; +import java.util.*; + +import basic.btree.*; +import basic.storage_model.TBAT; +import basic.util.*; + +public class exp_merge_progressive_btree { + + static int num_lines_1m=47660; + static int num_lines; +// static int num_lines=64*num_lines_1m; + static ArrayList pers=new ArrayList();//update percentages + static int appendix_num_split=10; //number of split files for appendixes + static double sel_per=0.1;//selection percentage + final static String dir_name= "results/exp_merge/"; + final static String 
bat_file_name=dir_name+"bat.txt"; + final static String tbat_file_name=dir_name+"tbat.txt"; + final static String tbat_temp_file_name=dir_name+"tbat_temp.txt"; + final static String select_file_name=dir_name+"select_"+sel_per+".txt"; + final static String result_file_name=dir_name+"results/result-merge-progressive-select.txt"; + + public static void main(String[] args) throws IOException{ + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + PrintWriter result_file= new PrintWriter(new FileWriter(result_file_name)); + + if(args.length<1){ + System.out.println("Input: num_lines\n"); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + System.out.println("Number of lines for experiment:"+num_lines); + result_file.println("Number of lines for experiment:"+num_lines+"\n"); + } + BTree appendixBTree = new BTree(); + int OFF=1; + String a; + String timestamp; + int b; + int valueOfA; + + + long start=System.currentTimeMillis(); + for(int p=1;p<=5;p++){ + pers.add(p*0.01); + } + + + + //---prepare files--- + prepareFiles(); + + //---do the experiment--- + int tbat_line_length= DataRetriever.getLineLength(tbat_file_name); + int bat_line_length = DataRetriever.getLineLength(bat_file_name); + List select_list= DataCreator.loadSelectionFile(select_file_name); + //for progressive approach + HashMap> all_times_select=new HashMap>(); + HashMap> all_times_merge=new HashMap>(); + HashMap all_memories=new HashMap(); + //for eacher approach +// HashMap all_times_select2=new HashMap(); + HashMap all_memories_eager=new HashMap(); + + for(double per:pers){ + System.out.println("exp: update "+per+"%"); + result_file.println("* exp: update "+per+"%"); + //-----progressive approach----- + BasicTools.copyFile(tbat_file_name, tbat_temp_file_name); + System.out.println("copied temp file"); + ArrayList times_select=new ArrayList(); + ArrayList times_merge=new ArrayList(); + ArrayList memories=new ArrayList(); + String appendix_file_prefix=dir_name+"appendix_"+per; + ArrayList appendix_file_names=new ArrayList(); + for(int i=1;i<=appendix_num_split;i++){ + appendix_file_names.add(appendix_file_prefix+"_"+i+".txt"); + Scanner reads = new Scanner(new File(dir_name + "update_" + per + ".txt")); + OFF = 1; + while (reads.hasNext()) { + a = reads.next(); // read OID + b = reads.nextInt(); // read VALUE + a = a.substring(0, a.length() - 1); // removing the comma that was auto-generated + valueOfA = Integer.parseInt(a); // placing that number into a variable + if (appendixBTree.get(valueOfA) != null) { + appendixBTree.findReplace(valueOfA, OFF); + } else { + appendixBTree.put(valueOfA, OFF); + }// end of if-else + OFF++; + } + //reads.close(); + + + } + for(int index=0;index<=appendix_num_split;index++){ + System.out.println("progressive sort merge tbat index:"+index); + if(index!=0){ + Runtime runtime = Runtime.getRuntime();//Get the Java runtime + long start_merge=System.currentTimeMillis(); + DataUpdator.sortMergeFileToTBAT2(tbat_temp_file_name, appendix_file_prefix+"_"+index+".txt", 1); + long end_merge=System.currentTimeMillis(); + double elapsed_time_merge=(end_merge-start_merge)/1000.0; + times_merge.add(elapsed_time_merge); + runtime.gc();//Run garbage collector + long memory = runtime.totalMemory() - runtime.freeMemory();//used memory + memories.add(MathTool.bytesToKB(memory)*1.0); + appendix_file_names.remove(0); + } + long target_value; + System.out.println("exp select TBAT uncleaned"); + long start2=System.currentTimeMillis(); + 
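+                // Selection on the uncleaned TBAT below probes the in-memory
+                // appendixBTree first: if the oid has a pending update, its value is
+                // read straight from the update file at the stored offset (OFF) via
+                // searchAppendixByOffSet; only oids with no pending update fall back
+                // to TBAT.selectTBAT_Uncleaned2 on the TBAT body.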
//System.out.println(appendixBTree); + try{ + RandomAccessFile updates = new RandomAccessFile(new File(dir_name + "update_" + per + ".txt"), "r"); + for(int target_oid:select_list){ + if(appendixBTree.get(target_oid) == null){ + target_value = TBAT.selectTBAT_Uncleaned2(tbat_file_name, num_lines, + tbat_line_length, target_oid); + }else{ + //target_value = DataRetriever.selectTBAT_Uncleaned_Split2(appendix_file_names, + //0, bat_line_length, target_oid); + //System.out.println(appendixBTree.searchKey(target_oid)); + target_value = TBAT.searchAppendixByOffSet(updates, 0, + bat_line_length, appendixBTree.get(target_oid), 1); + } + //System.out.printf("Target OID %d has value %d ", target_oid, target_value); + } + }catch(Exception ex){ + System.out.println("File Problems Again"); + } + long end2=System.currentTimeMillis(); + double elapsed_time2=(end2-start2)/1000.0; + times_select.add(elapsed_time2); + } + all_times_merge.put(per, times_merge); + all_times_select.put(per, times_select); + all_memories.put(per, MathTool.mean(memories)); + + + //-----eager approach----- + System.out.println("eager sort merge tbat"); + BasicTools.copyFile(tbat_file_name, tbat_temp_file_name); + Runtime runtime2 = Runtime.getRuntime();//Get the Java runtime + runtime2.gc();//Run garbage collector + DataUpdator.sortMergeFileToTBAT2(tbat_temp_file_name, dir_name+"update_"+per+".txt", 0); + long memory2 = runtime2.totalMemory() - runtime2.freeMemory();//used memory + all_memories_eager.put(per, MathTool.bytesToKB(memory2)*1.0); + } + System.out.println("Major expriment finished!"); + System.out.println(); + result_file.println("\n#Progressive:\n"); + + //-------merge time---------- + result_file.println("Merge Time:\n"); + //print table head + result_file.print("update_per\\merge_per"); + for(int index=1;index<=appendix_num_split;index++){ + result_file.print("|"+(int)(index*10)+"%"); + } + result_file.println(); + result_file.print("---"); + for(int index=0;index<=appendix_num_split;index++){ + result_file.print("|---"); + } + result_file.println(); + //print table body + for(double per:pers){ + result_file.print(per+""); + ArrayList times_merge=all_times_merge.get(per); + for(int i=0;i times_select=all_times_select.get(per); + for(int i=0;i pers=new ArrayList();//update percentages + static int appendix_num_split=10; //number of split files for appendixes + static double sel_per=0.1;//selection percentage + final static String dir_name= "results/exp_merge/"; + final static String bat_file_name=dir_name+"bat.txt"; + final static String tbat_file_name=dir_name+"tbat.txt"; + final static String tbat_temp_file_name=dir_name+"tbat_temp.txt"; + final static String select_file_name=dir_name+"select_"+sel_per+".txt"; + final static String result_file_name=dir_name+"results/result-merge-progressive-select.txt"; + + public static void main(String[] args) throws IOException{ + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + PrintWriter result_file= new PrintWriter(new FileWriter(result_file_name)); + + if(args.length<1){ + System.out.println("Input: num_lines\n"); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + System.out.println("Number of lines for experiment:"+num_lines); + result_file.println("Number of lines for experiment:"+num_lines+"\n"); + } + + long start=System.currentTimeMillis(); + for(int p=1;p<=5;p++){ + pers.add(p*0.01); + } + + //---prepare files--- + prepareFiles(); + + //---do the experiment--- + int 
tbat_line_length= DataRetriever.getLineLength(tbat_file_name); + List select_list= DataCreator.loadSelectionFile(select_file_name); + //for progressive approach + HashMap> all_times_select=new HashMap>(); + HashMap> all_times_merge=new HashMap>(); + HashMap all_memories=new HashMap(); + + //for eacher approach +// HashMap all_times_select2=new HashMap(); + HashMap all_memories_eager=new HashMap(); + + for(double per:pers){ + System.out.println("exp: update "+per+"%"); + result_file.println("* exp: update "+per+"%"); + + //-----progressive approach----- + BasicTools.copyFile(tbat_file_name, tbat_temp_file_name); + System.out.println("copy temp file"); + ArrayList times_select=new ArrayList(); + ArrayList times_merge=new ArrayList(); + ArrayList memories=new ArrayList(); + String appendix_file_prefix=dir_name+"appendix_"+per; + ArrayList appendix_file_names=new ArrayList(); + for(int i=1;i<=appendix_num_split;i++){ + appendix_file_names.add(appendix_file_prefix+"_"+i+".txt"); + } + for(int index=0;index<=appendix_num_split;index++){ + System.out.println("progressive sort merge tbat index:"+index); + if(index!=0){ + Runtime runtime = Runtime.getRuntime();//Get the Java runtime + long start_merge=System.currentTimeMillis(); + DataUpdator.sortMergeFileToTBAT2(tbat_temp_file_name, appendix_file_prefix+"_"+index+".txt", 1); + long end_merge=System.currentTimeMillis(); + double elapsed_time_merge=(end_merge-start_merge)/1000.0; + times_merge.add(elapsed_time_merge); + runtime.gc();//Run garbage collector + long memory = runtime.totalMemory() - runtime.freeMemory();//used memory + memories.add(MathTool.bytesToKB(memory)*1.0); + appendix_file_names.remove(0); + } + System.out.println("exp select TBAT uncleaned"); + long start2=System.currentTimeMillis(); + for(int target_oid:select_list){ + long target_value= TBAT.selectTBAT_Uncleaned_Split(tbat_file_name, + appendix_file_names, num_lines, tbat_line_length, target_oid); + } + long end2=System.currentTimeMillis(); + double elapsed_time2=(end2-start2)/1000.0; + times_select.add(elapsed_time2); + } + all_times_merge.put(per, times_merge); + all_times_select.put(per, times_select); + all_memories.put(per, MathTool.mean(memories)); + + + //-----eager approach----- + System.out.println("eager sort merge tbat"); + BasicTools.copyFile(tbat_file_name, tbat_temp_file_name); + Runtime runtime2 = Runtime.getRuntime();//Get the Java runtime + runtime2.gc();//Run garbage collector + DataUpdator.sortMergeFileToTBAT2(tbat_temp_file_name, dir_name+"update_"+per+".txt", 0); + long memory2 = runtime2.totalMemory() - runtime2.freeMemory();//used memory + all_memories_eager.put(per, MathTool.bytesToKB(memory2)*1.0); + } + System.out.println("Major expriment finished!"); + System.out.println(); + result_file.println("\n#Progressive:\n"); + + //-------merge time---------- + result_file.println("Merge Time:\n"); + //print table head + result_file.print("update_per\\merge_per"); + for(int index=1;index<=appendix_num_split;index++){ + result_file.print("|"+(int)(index*10)+"%"); + } + result_file.println(); + result_file.print("---"); + for(int index=0;index<=appendix_num_split;index++){ + result_file.print("|---"); + } + result_file.println(); + //print table body + for(double per:pers){ + result_file.print(per+""); + ArrayList times_merge=all_times_merge.get(per); + for(int i=0;i times_select=all_times_select.get(per); + for(int i=0;i pers_update=new ArrayList(); + ArrayList pers_select=new ArrayList(); + + //----------accept input arguments-------------- + 
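+        // Expected arguments: <num_lines> <max_exp_times> -u p1 p2 ... -s q1 q2 ...
+        // for example (values are illustrative only): 47660 5 -u 0.01 0.03 0.05 -s 0.1 0.2
+        // i.e. body size, repetitions per setting, update percentages, then
+        // selection percentages, parsed by the branches below.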
if(args.length<=3){ + out.println("Please input num_lines " + + " max_exp_times -u update_per1 per2 per3 ... -s select_per1 per2 per3 ..."); + System.exit(0); + }else{ + num_lines_body = Integer.parseInt(args[0]); + max_exp_times = Integer.parseInt(args[1]); + + if(args[2].equalsIgnoreCase("-u")){ + int i=3; + while(!args[i].equalsIgnoreCase("-s")){ + pers_update.add(Double.parseDouble(args[i++])); + } + i++; + while(i bat_select_time_medians=new ArrayList(); + ArrayList tbat_select_time_medians=new ArrayList(); + ArrayList bat_select_time_means=new ArrayList(); + ArrayList tbat_select_time_means=new ArrayList(); + ArrayList bat_select_time_maxs=new ArrayList(); + ArrayList tbat_select_time_maxs=new ArrayList(); + ArrayList bat_select_time_mins=new ArrayList(); + ArrayList tbat_select_time_mins=new ArrayList(); + ArrayList overhead_medians=new ArrayList(); + ArrayList overhead_means=new ArrayList(); + + for (double per_select:pers_select){ + out.println("per_update:"+per_update+" | per_select:"+per_select); + + ArrayList bat_select_time_temp=new ArrayList(); + ArrayList tbat_select_time_temp=new ArrayList(); + + List select_list= DataCreator.makeUpdateList(per_select, num_lines_body); + for(int i=0;i pers=new ArrayList(); + + if(args.length<2){ + out.println("Please input: num_lines per1 per2 per3 ... "); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + for(int i=1;i pers_update=new ArrayList(); + ArrayList pers_select=new ArrayList(); + + //----------accept input arguments-------------- + if(args.length<=3){ + out.println("Please input num_lines " + + " max_exp_times -u update_per1 per2 per3 ... -s select_per1 per2 per3 ..."); + System.exit(0); + }else{ + num_lines_body = Long.parseLong(args[0]); + max_exp_times = Integer.parseInt(args[1]); + + if(args[2].equalsIgnoreCase("-u")){ + int i=3; + while(!args[i].equalsIgnoreCase("-s")){ + pers_update.add(Double.parseDouble(args[i++])); + } + i++; + while(i select_list + * 2. selection tests + * 2.1 obtree creation + * obtree creation: loadUpdateFile -> appendixTree(i.e. 
obtree) + * 2.2 obtree selection + * use select_list + * 2.3 tbat (uncleaned, without index) selection + * selectTBAT_uncleaned + * 2.4 bat (updated) selection + * selectBAT + */ + + for (double per_update:pers_update){ + out.println("per_update:"+per_update); + String tbat_file_name="data/tbat_"+per_update+".txt"; + String bat_file_name="data/bat_"+per_update+".txt"; + String update_file_name="data/update_"+per_update+".txt"; + + int tbat_line_length= DataRetriever.getLineLength(tbat_file_name); + int bat_line_length=DataRetriever.getLineLength(bat_file_name); + + for (double per_select:pers_select){ + out.println("per_update:"+per_update+" | per_select:"+per_select); + + ArrayList bat_select_time_temp=new ArrayList(); + ArrayList tbat_select_time_temp=new ArrayList(); + + for(int i=1;i<=max_exp_times;i++){ + out.println("\tloop:"+(i+1)); + //make selection list + List select_list= DataCreator.makeUpdateList(per_select, num_lines_body); + long value; + + + //---load update appendix into btree--- + //btree create start + long btree_c_start=System.currentTimeMillis(); + OBTree obtree = new OBTree(); + obtree.loadUpdateFile(update_file_name); + Double btree_c_time=(double)(System.currentTimeMillis()-btree_c_start)/1000.0d; + //OB-tree creation time + result_file.format(format_string1, "btree_c", per_update, per_select, i, btree_c_time); + + //---btree select start--- + RandomAccessFile tbat_file=new RandomAccessFile(new File(tbat_file_name), "r");//open + long offset; + long btree_s_start=System.currentTimeMillis(); + for(long target_oid:select_list){ + offset=obtree.searchKey(target_oid); + if(offset!=DataRetriever.NOT_FOUND){ + value= TBAT.searchAppendixByOffSet(tbat_file, num_lines_body, tbat_line_length, offset, 2);//in a tbat, value is at 2 (3rd position in one line) + }else{ + value= TBAT.selectTBAT_body(tbat_file_name, num_lines_body, tbat_line_length, target_oid); + } + } + tbat_file.close(); + Double btree_s_time=(double)(System.currentTimeMillis()-btree_s_start)/1000.0d; + //record TBAT selection time with OB-tree + result_file.format(format_string1, "btree_s", per_update, per_select, i, btree_s_time); + + + //---select tbat uncleaned (without OB-tree index)-- + long tbat_start=System.currentTimeMillis(); + for(long target_oid:select_list){ + value= TBAT.selectTBAT_Uncleaned(tbat_file_name, num_lines_body, tbat_line_length, target_oid); + } + Double tbat_time=(double)(System.currentTimeMillis()-tbat_start)/1000.0d; + //record TBAT selection time without OB-tree + result_file.format(format_string1, "tbat_s", per_update, per_select, i, tbat_time); + + //---select bat--- + long bat_start=System.currentTimeMillis(); + for(long target_oid:select_list){ + value= BAT.selectBAT(bat_file_name, num_lines_body, bat_line_length, target_oid); + } + Double bat_time=(double)(System.currentTimeMillis()-bat_start)/1000.0d; + //record BAT selection time + result_file.format(format_string1, "bat_s", per_update, per_select, i, bat_time); + }//end max_exp_time + }//end pers_select + result_file.println("\n"); + }//end pers_update + + //-------------summary and elapsed time calculation------ + long end=System.currentTimeMillis(); + double elapsedTime=(end-start)/1000.0; + out.println("%Elapsed Time:"+elapsedTime+"s"); + result_file.println("% Elapsed Time:"+elapsedTime+"s"); + String program_end_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + result_file.println("% Program Started at: "+program_start_date_time); + result_file.println("% Program Ended 
at: "+program_end_date_time); + result_file.close(); + }//---end of main--- + +} diff --git a/exp_select/obtree/prepare_files.java b/exp_select/obtree/prepare_files.java new file mode 100644 index 0000000..0115d5e --- /dev/null +++ b/exp_select/obtree/prepare_files.java @@ -0,0 +1,67 @@ +package exp_select.obtree; + +import basic.util.DataCreator; +import basic.util.DataUpdator; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; + +import static java.lang.System.out; + +public class prepare_files { + + public static void main(String[] args) throws IOException { + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + long start=System.currentTimeMillis(); +// PrintWriter result_file= new PrintWriter(new FileWriter("data/result-selection.txt")); + + int num_lines = 0, max_exp_times=0; + ArrayList pers=new ArrayList(); + + if(args.length<2){ + out.println("Please input: num_lines per1 per2 per3 ... "); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + for(int i=1;i pers_update=new ArrayList(); + + if(args.length<3){ + out.println("% Please input: num_lines " + + "-u update_per1 update_per2 update_per3 ... "); + System.exit(0); + }else{ + //---parse arguments--- + num_lines = Long.parseLong(args[0]); + int index=1; + ArrayList temp_array=null; + while(index pers_update=new ArrayList(); + ArrayList pers_select=new ArrayList(); + + //----------accept input arguments-------------- + if(args.length<=3){ + out.println("Please input num_lines " + + " max_exp_times -u update_per1 per2 per3 ... -s select_per1 per2 per3 ..."); + System.exit(0); + }else{ + num_lines_body = Long.parseLong(args[0]); + max_exp_times = Integer.parseInt(args[1]); + + if(args[2].equalsIgnoreCase("-u")){ + int i=3; + while(!args[i].equalsIgnoreCase("-s")){ + pers_update.add(Double.parseDouble(args[i++])); + } + i++; + while(i obtree_size_list=new ArrayList(); + for (double per_update:pers_update){ + out.println("per_update:"+per_update); + + String tbat_file_name1=data_folder+"tbat_l"+num_lines_body+"_p"+per_update+"_1.txt";//unclean + String tbat_file_name2=data_folder+"tbat_l"+num_lines_body+"_p"+per_update+"_2.txt";//merged + String update_file_name=data_folder+"update_l"+num_lines_body+"_p"+per_update+".txt"; + + int tbat_line_length= DataRetriever.getLineLength(tbat_file_name2); + + long btree_c_start=System.currentTimeMillis(); + OBTree obtree = new OBTree(); + obtree.loadUpdateFile(update_file_name); + Double btree_c_time=(double)(System.currentTimeMillis()-btree_c_start)/1000.0d; + //OB-tree size + double obtree_size=obtree.toKB(); + obtree_size_list.add(obtree_size); + result_file.format(format_string1, "obtree_size", per_update, obtree_size); + result_file.flush();//flush each time for long experiments + + }//end pers_update + + result_file.println(); + + result_file.format("%-15s\t %-15s\t\n", "obtree_size_mean", MathTool.mean(obtree_size_list)); + result_file.println(); + + //-------------summary and elapsed time calculation------ + long end=System.currentTimeMillis(); + double elapsedTime=(end-start)/1000.0; + out.println("%Elapsed Time:"+elapsedTime+"s"); + result_file.println("% Elapsed Time:"+elapsedTime+"s"); + String program_end_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + result_file.println("% Program Started at: "+program_start_date_time); + result_file.println("% Program Ended at: 
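// Editorial sketch (not part of the commit): the size experiment above builds one
// OB-Tree per update percentage, records its footprint via toKB(), and finally
// averages the sizes with MathTool.mean. A condensed version of that loop follows;
// the method name measureMeanSizeKB is illustrative, the assumption that MathTool.mean
// accepts the list of Doubles and returns a double is not confirmed by this patch, and
// the file naming follows the update_l<lines>_p<percent>.txt convention used above.
import java.io.IOException;
import java.util.ArrayList;

import basic.btree.OBTree;
import basic.util.MathTool;

class ObtreeSizeSketch {
    static double measureMeanSizeKB(String dataFolder, long numLines, double[] updatePercents) throws IOException {
        ArrayList<Double> sizes = new ArrayList<>();
        for (double per : updatePercents) {
            String updateFile = dataFolder + "update_l" + numLines + "_p" + per + ".txt";
            OBTree obtree = new OBTree();
            obtree.loadUpdateFile(updateFile);                 // bulk-load the update appendix
            sizes.add(obtree.toKB());                          // index footprint in KB
        }
        return MathTool.mean(sizes);                           // mean size across percentages
    }
}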
"+program_end_date_time); + result_file.println(); + result_file.close(); + }//---end of main--- + +} diff --git a/exp_select/obtree2/exp_select_obtree2.java b/exp_select/obtree2/exp_select_obtree2.java new file mode 100644 index 0000000..f1ab6c7 --- /dev/null +++ b/exp_select/obtree2/exp_select_obtree2.java @@ -0,0 +1,159 @@ +package exp_select.obtree2; + +import basic.btree.OBTree; +import basic.storage_model.BAT; +import basic.storage_model.TBAT; +import basic.util.DataCreator; +import basic.util.DataRetriever; + +import java.io.*; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; +import java.util.List; + +import static java.lang.System.lineSeparator; +import static java.lang.System.out; + +public class exp_select_obtree2 { + + static String format_string1="%-15s\t %-15s\t %-15s\t %-15s\t %-15s\n"; + + public static void main(String args[]) throws IOException{ + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + long start=System.currentTimeMillis(); + PrintWriter result_file= new PrintWriter(new FileWriter("data/result-select.txt")); + String data_folder=prepare_files2.data_folder; + long num_lines_body = 0; + int max_exp_times=0; + ArrayList pers_update=new ArrayList(); + ArrayList pers_select=new ArrayList(); + + //----------accept input arguments-------------- + if(args.length<=3){ + out.println("Please input num_lines " + + " max_exp_times -u update_per1 per2 per3 ... -s select_per1 per2 per3 ..."); + System.exit(0); + }else{ + num_lines_body = Long.parseLong(args[0]); + max_exp_times = Integer.parseInt(args[1]); + + if(args[2].equalsIgnoreCase("-u")){ + int i=3; + while(!args[i].equalsIgnoreCase("-s")){ + pers_update.add(Double.parseDouble(args[i++])); + } + i++; + while(i appendixTree(i.e. 
obtree) + * 2.2 obtree selection + * use select_list + * 2.3 tbat (uncleaned, without index) selection + * selectTBAT_uncleaned + * 2.4 bat (updated) selection + * selectBAT + */ + + for (double per_update:pers_update){ + out.println("per_update:"+per_update); + + String tbat_file_name1=data_folder+"tbat_l"+num_lines_body+"_p"+per_update+"_1.txt";//unclean + String tbat_file_name2=data_folder+"tbat_l"+num_lines_body+"_p"+per_update+"_2.txt";//merged + String update_file_name=data_folder+"update_l"+num_lines_body+"_p"+per_update+".txt"; + + + int tbat_line_length= DataRetriever.getLineLength(tbat_file_name2); + + for (double per_select:pers_select){ + out.println("per_update:"+per_update+" | per_select:"+per_select); + + String select_file_name=data_folder+"select_l"+num_lines_body+"_p"+per_select+".txt"; + ArrayList bat_select_time_temp=new ArrayList(); + ArrayList tbat_select_time_temp=new ArrayList(); + + for(int i=1;i<=max_exp_times;i++){ + out.println("\tloop:"+i); + //---search tbat cleaned + long tbat_c_start=System.currentTimeMillis(); + TBAT.selectTBAT_body(tbat_file_name2,select_file_name,num_lines_body,tbat_line_length); + Double tbat_c_time=(double)(System.currentTimeMillis()-tbat_c_start)/1000.0d; + result_file.format(format_string1, "tbat_c_s", per_update, per_select, i, tbat_c_time);//record time + + //---search tbat uncleaned (without OB-tree index)--- + // too long we include the time of uncleaned search in 10000L and 10MB only +// long tbat_start=System.currentTimeMillis(); +// TBAT.selectTBAT_Uncleaned(tbat_file_name1,select_file_name,num_lines_body,tbat_line_length); +// Double tbat_time=(double)(System.currentTimeMillis()-tbat_start)/1000.0d; +// result_file.format(format_string1, "tbat_un_s", per_update, per_select, i, tbat_time); + + //---load update appendix into btree--- + //btree create start + long btree_c_start=System.currentTimeMillis(); + OBTree obtree = new OBTree(); + obtree.loadUpdateFile(update_file_name); + Double btree_c_time=(double)(System.currentTimeMillis()-btree_c_start)/1000.0d; + //OB-tree creation time + result_file.format(format_string1, "obtree_c", per_update, per_select, i, btree_c_time); + result_file.format(format_string1, "obtree_size", per_update, per_select, i, obtree.toKB()); + + //---search tbat (uncleaned) with OB-tree index--- + long btree_s_start=System.currentTimeMillis(); + TBAT.searchWithOBTree(obtree,tbat_file_name1,select_file_name,num_lines_body,tbat_line_length); + Double btree_s_time=(double)(System.currentTimeMillis()-btree_s_start)/1000.0d; + //record TBAT selection time with OB-tree + result_file.format(format_string1, "tbat_ob_s", per_update, per_select, i, btree_s_time); + + }//end max_exp_time + }//end pers_select + result_file.println("\n"); + result_file.flush();//flush each time for long experiments + + }//end pers_update + + //-------------summary and elapsed time calculation------ + long end=System.currentTimeMillis(); + double elapsedTime=(end-start)/1000.0; + out.println("%Elapsed Time:"+elapsedTime+"s"); + result_file.println("% Elapsed Time:"+elapsedTime+"s"); + String program_end_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + result_file.println("% Program Started at: "+program_start_date_time); + result_file.println("% Program Ended at: "+program_end_date_time); + result_file.close(); + }//---end of main--- + +} diff --git a/exp_select/obtree2/prepare_files2.java b/exp_select/obtree2/prepare_files2.java new file mode 100644 index 0000000..0781f40 --- 
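// Editorial sketch (not part of the commit): the obtree2 driver above answers the same
// selection file in two ways -- scan the merged (cleaned) TBAT body directly, or keep
// the TBAT uncleaned and route every oid through a freshly built OB-Tree. The helper
// name compareSelections is illustrative; the TBAT and OBTree calls mirror the ones
// timed in the loop above.
import java.io.IOException;

import basic.btree.OBTree;
import basic.storage_model.TBAT;

class Obtree2SelectSketch {
    static void compareSelections(String mergedTbat, String uncleanTbat, String updateFile,
                                  String selectFile, long numLinesBody, int lineLength) throws IOException {
        // baseline: selection against the merged TBAT body (updates already folded in)
        TBAT.selectTBAT_body(mergedTbat, selectFile, numLinesBody, lineLength);

        // alternative: leave the TBAT uncleaned and consult the OB-Tree index per oid
        OBTree obtree = new OBTree();
        obtree.loadUpdateFile(updateFile);
        TBAT.searchWithOBTree(obtree, uncleanTbat, selectFile, numLinesBody, lineLength);
    }
}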
/dev/null +++ b/exp_select/obtree2/prepare_files2.java @@ -0,0 +1,99 @@ +package exp_select.obtree2; + +import basic.util.*; + +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Calendar; + +import static java.lang.System.out; + +public class prepare_files2{ + + static String data_folder="data/obtree2/"; + + public static void main(String[] args) throws IOException { + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + long start=System.currentTimeMillis(); + + long num_lines = 0, max_exp_times=0; + ArrayList pers_update=new ArrayList(); + ArrayList pers_select=new ArrayList(); + + if(args.length<5){ + out.println("Please input: num_lines " + + "-u update_per1 update_per2 update_per3 ... " + + "-s select_per1 select_per2 select_per3 ... "); + System.exit(0); + }else{ + //---parse arguments--- + num_lines = Long.parseLong(args[0]); + int index=1; + ArrayList temp_array=null; + while(index0) + DataCreator.prepareSelectionFile5(select_file_name,per,num_lines); + } + + long end=System.currentTimeMillis(); + double elapsedTime=(end-start)/1000.0; + out.println("Elapsed Time:"+elapsedTime+"s"); + } + +} diff --git a/exp_update/exp_update.java b/exp_update/exp_update.java new file mode 100644 index 0000000..aaf0b40 --- /dev/null +++ b/exp_update/exp_update.java @@ -0,0 +1,170 @@ +package exp_update; + +import java.io.*; +import java.text.SimpleDateFormat; +import java.util.*; + +import basic.util.DataCreator; +import basic.util.DataUpdator; +import basic.util.MathTool; + +import static java.lang.System.out; + +public class exp_update { + + static int reader_buffer_size=1; // buffer size for BufferedReader (characters); normal default size is 8192 characters + + public static void main(String[] args) throws IOException{ + String program_start_date_time=new SimpleDateFormat("yyyy/MM/dd HH:mm:ssZ").format(Calendar.getInstance().getTime()); + long start=System.currentTimeMillis(); + String bat_file_name="data/bat.txt"; + String tbat_file_name="data/tbat.txt"; + PrintWriter result_file= new PrintWriter(new FileWriter("data/result.txt")); + + int num_lines = 0, max_exp_times=0; + ArrayList pers=new ArrayList(); + + if(args.length<=3){ + out.println("Please input num_lines " + + " max_exp_times per1 per2 per3 ... "); + System.exit(0); + }else{ + num_lines = Integer.parseInt(args[0]); + max_exp_times = Integer.parseInt(args[1]); + for(int i=2;i> bat_update_time_table = new HashMap>(); +// HashMap> tbat_update_time_table = new HashMap>(); + ArrayList bat_update_time_medians=new ArrayList(); + ArrayList tbat_update_time_medians=new ArrayList(); + ArrayList bat_update_time_means=new ArrayList(); + ArrayList tbat_update_time_means=new ArrayList(); + ArrayList bat_update_time_maxs=new ArrayList(); + ArrayList tbat_update_time_maxs=new ArrayList(); + ArrayList bat_update_time_mins=new ArrayList(); + ArrayList tbat_update_time_mins=new ArrayList(); + ArrayList overhead_medians=new ArrayList(); + ArrayList overhead_means=new ArrayList(); + + for(double per:pers){ + out.println("per:"+per); + + ArrayList bat_update_time_temp=new ArrayList(); + ArrayList tbat_update_time_temp=new ArrayList(); + + result_file.println("percentage = "+per+"\n"); + result_file.format("\t %-2s | %-10s | %-10s | %-10s\n", "i", "tbat_time", "bat_time", "overhead"); + + for(int i=0;i
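// Editorial sketch (not part of the commit): conceptually, exp_update times the same
// batch of updates against the two storage models -- a BAT is updated in place (seek
// to the row and overwrite it), while a TBAT appends the new value to its timestamped
// appendix and leaves the body untouched. The fixed-width line layout, the comma
// separator, the 1-based dense oids, and the helper names below are illustrative
// assumptions; they are not the repository's DataUpdator API.
import java.io.FileWriter;
import java.io.IOException;
import java.io.RandomAccessFile;

class UpdateModelSketch {
    // in-place update: overwrite the fixed-width line that holds this oid
    static void updateBatInPlace(String batFile, long oid, long newValue, int lineLength) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile(batFile, "rw")) {
            raf.seek((oid - 1) * lineLength);                  // assumes dense, 1-based oids
            String line = String.format("%-" + (lineLength - 1) + "s", oid + "," + newValue);
            raf.write((line + "\n").getBytes());               // lineLength includes the newline here
        }
    }

    // append-only update: add one line to the TBAT appendix
    static void appendTbatUpdate(String tbatFile, long oid, long timestamp, long newValue) throws IOException {
        try (FileWriter w = new FileWriter(tbatFile, true)) {  // true = append mode
            w.write(oid + "," + timestamp + "," + newValue + "\n");
        }
    }
}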
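// Editorial sketch (not part of the commit): exp_update gathers one timing per loop
// into bat_update_time_temp / tbat_update_time_temp and then reduces each list to the
// median, mean, max and min kept in the *_medians/*_means/*_maxs/*_mins lists above.
// A plain-Java version of that reduction, independent of the repository's MathTool,
// could look like this; the class name StatsSketch is illustrative.
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

class StatsSketch {
    static double mean(List<Double> xs) {
        double sum = 0.0;
        for (double x : xs) sum += x;
        return sum / xs.size();
    }

    static double median(List<Double> xs) {
        List<Double> sorted = new ArrayList<>(xs);
        Collections.sort(sorted);
        int n = sorted.size();
        // average the two middle values when the run count is even
        return (n % 2 == 1) ? sorted.get(n / 2)
                            : (sorted.get(n / 2 - 1) + sorted.get(n / 2)) / 2.0;
    }

    static double max(List<Double> xs) { return Collections.max(xs); }
    static double min(List<Double> xs) { return Collections.min(xs); }
}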