diff --git a/src/compiled-proto/boa/types/Diff.java b/src/compiled-proto/boa/types/Diff.java index 2967dd8b1..11e1d9cd7 100644 --- a/src/compiled-proto/boa/types/Diff.java +++ b/src/compiled-proto/boa/types/Diff.java @@ -249,6 +249,16 @@ public interface ChangedFileOrBuilder * */ int getPreviousIndices(int index); + + // optional int32 ast_key = 11; + /** + * optional int32 ast_key = 11; + */ + boolean hasAstKey(); + /** + * optional int32 ast_key = 11; + */ + int getAstKey(); } /** * Protobuf type {@code boa.types.ChangedFile} @@ -438,6 +448,11 @@ private ChangedFile( input.popLimit(limit); break; } + case 88: { + bitField0_ |= 0x00000040; + astKey_ = input.readInt32(); + break; + } } } } catch (com.google.protobuf.InvalidProtocolBufferException e) { @@ -1780,6 +1795,22 @@ public int getPreviousIndices(int index) { return previousIndices_.get(index); } + // optional int32 ast_key = 11; + public static final int AST_KEY_FIELD_NUMBER = 11; + private int astKey_; + /** + * optional int32 ast_key = 11; + */ + public boolean hasAstKey() { + return ((bitField0_ & 0x00000040) == 0x00000040); + } + /** + * optional int32 ast_key = 11; + */ + public int getAstKey() { + return astKey_; + } + private void initFields() { change_ = boa.types.Shared.ChangeKind.UNKNOWN; kind_ = boa.types.Diff.ChangedFile.FileKind.OTHER; @@ -1791,6 +1822,7 @@ private void initFields() { previousNames_ = com.google.protobuf.LazyStringArrayList.EMPTY; previousVersions_ = java.util.Collections.emptyList(); previousIndices_ = java.util.Collections.emptyList(); + astKey_ = 0; } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { @@ -1860,6 +1892,9 @@ public void writeTo(com.google.protobuf.CodedOutputStream output) for (int i = 0; i < previousIndices_.size(); i++) { output.writeInt32(10, previousIndices_.get(i)); } + if (((bitField0_ & 0x00000040) == 0x00000040)) { + output.writeInt32(11, astKey_); + } getUnknownFields().writeTo(output); } @@ -1929,6 +1964,10 @@ public int getSerializedSize() { size += dataSize; size += 1 * getPreviousIndicesList().size(); } + if (((bitField0_ & 0x00000040) == 0x00000040)) { + size += com.google.protobuf.CodedOutputStream + .computeInt32Size(11, astKey_); + } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; @@ -2074,6 +2113,8 @@ public Builder clear() { bitField0_ = (bitField0_ & ~0x00000100); previousIndices_ = java.util.Collections.emptyList(); bitField0_ = (bitField0_ & ~0x00000200); + astKey_ = 0; + bitField0_ = (bitField0_ & ~0x00000400); return this; } @@ -2151,6 +2192,10 @@ public boa.types.Diff.ChangedFile buildPartial() { bitField0_ = (bitField0_ & ~0x00000200); } result.previousIndices_ = previousIndices_; + if (((from_bitField0_ & 0x00000400) == 0x00000400)) { + to_bitField0_ |= 0x00000040; + } + result.astKey_ = astKey_; result.bitField0_ = to_bitField0_; onBuilt(); return result; @@ -2227,6 +2272,9 @@ public Builder mergeFrom(boa.types.Diff.ChangedFile other) { } onChanged(); } + if (other.hasAstKey()) { + setAstKey(other.getAstKey()); + } this.mergeUnknownFields(other.getUnknownFields()); return this; } @@ -3150,6 +3198,39 @@ public Builder clearPreviousIndices() { return this; } + // optional int32 ast_key = 11; + private int astKey_ ; + /** + * optional int32 ast_key = 11; + */ + public boolean hasAstKey() { + return ((bitField0_ & 0x00000400) == 0x00000400); + } + /** + * optional int32 ast_key = 11; + */ + public int getAstKey() { + return astKey_; + } + /** + * optional int32 ast_key = 11; + */ + public 
Builder setAstKey(int value) { + bitField0_ |= 0x00000400; + astKey_ = value; + onChanged(); + return this; + } + /** + * optional int32 ast_key = 11; + */ + public Builder clearAstKey() { + bitField0_ = (bitField0_ & ~0x00000400); + astKey_ = 0; + onChanged(); + return this; + } + // @@protoc_insertion_point(builder_scope:boa.types.ChangedFile) } @@ -3176,38 +3257,38 @@ public Builder clearPreviousIndices() { static { java.lang.String[] descriptorData = { "\n\ndiff.proto\022\tboa.types\032\014shared.proto\032\ta" + - "st.proto\"\325\t\n\013ChangedFile\022%\n\006change\030\001 \002(\016" + + "st.proto\"\346\t\n\013ChangedFile\022%\n\006change\030\001 \002(\016" + "2\025.boa.types.ChangeKind\022-\n\004kind\030\002 \002(\0162\037." + "boa.types.ChangedFile.FileKind\022\014\n\004name\030\003" + " \002(\t\022\013\n\003key\030\004 \002(\004\022\013\n\003ast\030\005 \002(\010\022)\n\010commen" + "ts\030\006 \001(\0132\027.boa.types.CommentsRoot\022&\n\007cha" + "nges\030\007 \003(\0162\025.boa.types.ChangeKind\022\026\n\016pre" + "vious_names\030\010 \003(\t\022\031\n\021previous_versions\030\t" + - " \003(\005\022\030\n\020previous_indices\030\n \003(\005\"\247\007\n\010FileK" + - "ind\022\t\n\005OTHER\020\000\022\n\n\006BINARY\020\001\022\010\n\004TEXT\020\002\022\007\n\003", - "XML\020\003\022\025\n\021SOURCE_JAVA_ERROR\020d\022\024\n\020SOURCE_J" + - "AVA_JLS2\020f\022\024\n\020SOURCE_JAVA_JLS3\020g\022\024\n\020SOUR" + - "CE_JAVA_JLS4\020h\022\024\n\020SOURCE_JAVA_JLS8\020l\022\016\n\n" + - "JAVA_ERROR\020d\022\010\n\004JLS2\020f\022\010\n\004JLS3\020g\022\010\n\004JLS4" + - "\020h\022\010\n\004JLS8\020l\022\024\n\017SOURCE_CS_ERROR\020\310\001\022\022\n\rSO" + - "URCE_CS_CS1\020\311\001\022\022\n\rSOURCE_CS_CS2\020\312\001\022\022\n\rSO" + - "URCE_CS_CS3\020\313\001\022\022\n\rSOURCE_CS_CS4\020\314\001\022\022\n\rSO" + - "URCE_CS_CS5\020\315\001\022\r\n\010CS_ERROR\020\310\001\022\010\n\003CS1\020\311\001\022" + - "\010\n\003CS2\020\312\001\022\010\n\003CS3\020\313\001\022\010\n\003CS4\020\314\001\022\010\n\003CS5\020\315\001\022" + - "\024\n\017SOURCE_JS_ERROR\020\254\002\022\022\n\rSOURCE_JS_ES1\020\255", - "\002\022\022\n\rSOURCE_JS_ES2\020\256\002\022\022\n\rSOURCE_JS_ES3\020\257" + - "\002\022\022\n\rSOURCE_JS_ES5\020\260\002\022\022\n\rSOURCE_JS_ES6\020\261" + - "\002\022\022\n\rSOURCE_JS_ES7\020\262\002\022\022\n\rSOURCE_JS_ES8\020\263" + - "\002\022\r\n\010JS_ERROR\020\254\002\022\025\n\020SOURCE_PHP_ERROR\020\220\003\022" + - "\020\n\013SOURCE_PHP5\020\221\003\022\022\n\rSOURCE_PHP5_3\020\222\003\022\022\n" + - "\rSOURCE_PHP5_4\020\223\003\022\022\n\rSOURCE_PHP5_5\020\224\003\022\022\n" + - "\rSOURCE_PHP5_6\020\225\003\022\022\n\rSOURCE_PHP7_0\020\226\003\022\022\n" + - "\rSOURCE_PHP7_1\020\227\003\022\016\n\tPHP_ERROR\020\220\003\022\026\n\021SOU" + - "RCE_HTML_ERROR\020\364\003\022\020\n\013Source_HTML\020\365\003\022\017\n\nH" + - "TML_ERROR\020\364\003\022\025\n\020SOURCE_XML_ERROR\020\330\004\022\017\n\nS", - "ource_XML\020\331\004\022\016\n\tXML_ERROR\020\330\004\022\025\n\020SOURCE_C" + - "SS_ERROR\020\274\005\022\017\n\nSource_CSS\020\275\005\022\016\n\tCSS_ERRO" + - "R\020\274\005\032\002\020\001B\002H\001" + " \003(\005\022\030\n\020previous_indices\030\n \003(\005\022\017\n\007ast_ke" + + "y\030\013 \001(\005\"\247\007\n\010FileKind\022\t\n\005OTHER\020\000\022\n\n\006BINAR", + "Y\020\001\022\010\n\004TEXT\020\002\022\007\n\003XML\020\003\022\025\n\021SOURCE_JAVA_ER" + + "ROR\020d\022\024\n\020SOURCE_JAVA_JLS2\020f\022\024\n\020SOURCE_JA" + + 
"VA_JLS3\020g\022\024\n\020SOURCE_JAVA_JLS4\020h\022\024\n\020SOURC" + + "E_JAVA_JLS8\020l\022\016\n\nJAVA_ERROR\020d\022\010\n\004JLS2\020f\022" + + "\010\n\004JLS3\020g\022\010\n\004JLS4\020h\022\010\n\004JLS8\020l\022\024\n\017SOURCE_" + + "CS_ERROR\020\310\001\022\022\n\rSOURCE_CS_CS1\020\311\001\022\022\n\rSOURC" + + "E_CS_CS2\020\312\001\022\022\n\rSOURCE_CS_CS3\020\313\001\022\022\n\rSOURC" + + "E_CS_CS4\020\314\001\022\022\n\rSOURCE_CS_CS5\020\315\001\022\r\n\010CS_ER" + + "ROR\020\310\001\022\010\n\003CS1\020\311\001\022\010\n\003CS2\020\312\001\022\010\n\003CS3\020\313\001\022\010\n\003" + + "CS4\020\314\001\022\010\n\003CS5\020\315\001\022\024\n\017SOURCE_JS_ERROR\020\254\002\022\022", + "\n\rSOURCE_JS_ES1\020\255\002\022\022\n\rSOURCE_JS_ES2\020\256\002\022\022" + + "\n\rSOURCE_JS_ES3\020\257\002\022\022\n\rSOURCE_JS_ES5\020\260\002\022\022" + + "\n\rSOURCE_JS_ES6\020\261\002\022\022\n\rSOURCE_JS_ES7\020\262\002\022\022" + + "\n\rSOURCE_JS_ES8\020\263\002\022\r\n\010JS_ERROR\020\254\002\022\025\n\020SOU" + + "RCE_PHP_ERROR\020\220\003\022\020\n\013SOURCE_PHP5\020\221\003\022\022\n\rSO" + + "URCE_PHP5_3\020\222\003\022\022\n\rSOURCE_PHP5_4\020\223\003\022\022\n\rSO" + + "URCE_PHP5_5\020\224\003\022\022\n\rSOURCE_PHP5_6\020\225\003\022\022\n\rSO" + + "URCE_PHP7_0\020\226\003\022\022\n\rSOURCE_PHP7_1\020\227\003\022\016\n\tPH" + + "P_ERROR\020\220\003\022\026\n\021SOURCE_HTML_ERROR\020\364\003\022\020\n\013So" + + "urce_HTML\020\365\003\022\017\n\nHTML_ERROR\020\364\003\022\025\n\020SOURCE_", + "XML_ERROR\020\330\004\022\017\n\nSource_XML\020\331\004\022\016\n\tXML_ERR" + + "OR\020\330\004\022\025\n\020SOURCE_CSS_ERROR\020\274\005\022\017\n\nSource_C" + + "SS\020\275\005\022\016\n\tCSS_ERROR\020\274\005\032\002\020\001B\002H\001" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -3219,7 +3300,7 @@ public com.google.protobuf.ExtensionRegistry assignDescriptors( internal_static_boa_types_ChangedFile_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_boa_types_ChangedFile_descriptor, - new java.lang.String[] { "Change", "Kind", "Name", "Key", "Ast", "Comments", "Changes", "PreviousNames", "PreviousVersions", "PreviousIndices", }); + new java.lang.String[] { "Change", "Kind", "Name", "Key", "Ast", "Comments", "Changes", "PreviousNames", "PreviousVersions", "PreviousIndices", "AstKey", }); return null; } }; diff --git a/src/java/boa/datagen/forges/github/GetReposByLanguage.java b/src/java/boa/datagen/forges/github/GetReposByLanguage.java index 4c5dd5ba8..8eeef9135 100644 --- a/src/java/boa/datagen/forges/github/GetReposByLanguage.java +++ b/src/java/boa/datagen/forges/github/GetReposByLanguage.java @@ -1,26 +1,49 @@ package boa.datagen.forges.github; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.util.ArrayList; import java.util.Calendar; import java.util.Date; +import java.util.HashSet; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; -import boa.datagen.util.FileIO; +import boa.datagen.util.FileIO; public class GetReposByLanguage { - + + // GET PROJECT WITH STARS LARGER OR EQUAL TO THIS NUMBER + static int stars = 0; + static long start, stop = 0; + public static void main(String[] args) { + + if 
(args.length < 4) { + System.out.println("args: TOKEN_FILE_INPUT_PATH, OUTPUT_PATH, STARS, LANGS"); + return; + } + TokenList tokens = new TokenList(args[0]); String outDir = args[1]; - String[] languages = { "java" }; - if (args.length > 2) { - languages = new String[args.length - 2]; - for (int i = 2; i < args.length; i++) - languages[i - 2] = args[i]; + stars = Integer.parseInt(args[2]); + // LANGS may be one arg ("java;php") or several ("java, php"); join everything after STARS and split on either separator + String[] languages; + { + String langArgs = ""; + for (int i = 3; i < args.length; i++) { + langArgs += args[i]; + } + languages = langArgs.split("[;,]"); + for (int i = 0; i < languages.length; i++) + languages[i] = languages[i].trim(); } Thread[] workers = new Thread[languages.length]; for (int i = 0; i < languages.length; i++) { @@ -35,91 +58,115 @@ public static void main(String[] args) { e.printStackTrace(); } } - - + public static class Worker implements Runnable { - private final int id; + private final int id; private final String language; private TokenList tokens; private final String outDir; private JsonArray repos = new JsonArray(); private final int RECORDS_PER_FILE = 100; private int counter = 0; - + private HashSet<Integer> processedRepID = new HashSet<>(); + private ArrayList<Integer> IDtoWrite = new ArrayList<>(); + public Worker(int id, String language, String outDir, TokenList tokenList) { this.id = id; this.language = language; this.outDir = outDir; this.tokens = tokenList; + File processedRepos = new File(outDir + "/" + language + "processed.txt"); + try { + processedRepos.createNewFile(); + Files.lines(processedRepos.toPath()).forEach(repID -> processedRepID.add(Integer.parseInt(repID))); + } catch (IOException e) { + e.printStackTrace(); + } } - + @Override public void run() { + start = System.currentTimeMillis(); Calendar cal = Calendar.getInstance(); cal.setTime(new Date()); cal.add(Calendar.DATE, 1); int year = cal.get(Calendar.YEAR); int month = cal.get(Calendar.MONTH) + 1; // month starts from 0 int day = cal.get(Calendar.DAY_OF_MONTH); + + String monthString = month < 10 ? "0" + month : String.valueOf(month); + String dayString = day < 10 ? 
"0" + day : String.valueOf(day); + String time = year + "-" + monthString + "-" + dayString + "T23:59:59Z"; + // String time = "2018-12-21T01:01:01Z"; - String time = year + "-" + month + "-" + day + "T23:59:59Z"; +// String time = year + "-" + month + "-" + day + "T23:59:59Z"; Gson parser = new Gson(); - while (true){ - Token tok = this.tokens.getNextAuthenticToken("https://api.github.com/repositories"); - String url = "https://api.github.com/search/repositories?q=language:" + language +"+stars:>1+pushed:<=" + time + "&sort=updated&order=desc&per_page=100"; - System.out.println(url); - MetadataCacher mc = new MetadataCacher(url, tok.getUserName(), tok.getToken()); - mc.authenticate(); - while (!mc.isAuthenticated() || mc.getNumberOfRemainingLimit() <= 0) { - try { - Thread.sleep(1000); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - mc = new MetadataCacher(url, tok.getUserName(), tok.getToken()); - mc.authenticate(); - } - mc.getResponseJson(); - String content = mc.getContent(); + Token tokenGetAPI = this.tokens.getNextAuthenticTokenM("https://api.github.com"); + Token tokenSearch = this.tokens.getNextAuthenticTokenM("https://api.github.com/search/repositories?q=language"); + MetadataCacher mcGetAPI = new MetadataCacher("https://api.github.com/repositories", tokenGetAPI.getUserName(), tokenGetAPI.getToken()); + mcGetAPI.authenticate(); + while (true) { + String searchURL = "https://api.github.com/search/repositories?q=language:" + language + "+stars:>=" + stars + + "+pushed:<=" + time + "&sort=updated&order=desc&per_page=100"; + System.out.println(searchURL); + MetadataCacher mcSearch = new MetadataCacher(searchURL, tokenSearch.getUserName(), tokenSearch.getToken()); + mcSearch.authenticate(); + // We don't need to check for remaining limit for search because 30 requests/1 min is way faster than we could process, could be + // useful if in the future we can find a way to get 100 repos in 2 secs. 
+// while (mcSearch.getNumberOfRemainingLimit() <= 0) { +// System.out.println("user: " + tokenSearch.getUserName() + " limit: " + mcSearch.getNumberOfRemainingLimit()); +// tokenSearch = this.tokens.getNextAuthenticToken(searchURL); +// mcSearch = new MetadataCacher(searchURL, tokenSearch.getUserName(), tokenSearch.getToken()); +// } + mcSearch.getResponseJson(); + String content = mcSearch.getContent(); + JsonObject json = null; json = parser.fromJson(content, JsonElement.class).getAsJsonObject(); - JsonArray items = json.getAsJsonArray("items"); - if (items.size() > 0) { - for (int j = 0; j < items.size(); j++) { - JsonObject item = items.get(j).getAsJsonObject(); - this.addRepo(item); - String pushed = item.get("pushed_at").getAsString(); - if (pushed.compareTo(time) < 0){ - time = pushed; - } - } - } - int count = json.get("total_count").getAsInt(); - if (count == items.size()) - break; - if (tok.getNumberOfRemainingLimit() <= 1) { - long t = mc.getLimitResetTime() * 1000 - System.currentTimeMillis(); - if (t >= 0) { - System.out.println("Waiting " + (t/1000) + " seconds for sending more requests."); - try { - Thread.sleep(t); - } catch (InterruptedException e) { - e.printStackTrace(); + JsonArray items = json.getAsJsonArray("items"); + + if (items.size() > 0) { + int getLimit = mcGetAPI.getNumberOfRemainingLimit(); + System.out.println("Get API rate limit remaining: " + getLimit); + + if (getLimit < items.size() + 1 || !mcGetAPI.isAuthenticated()) { + tokenGetAPI = this.tokens.getNextAuthenticToken("https://api.github.com/repositories", items.size() + 2); // need items.size() requests, plus one to verify the token and one for the languages link + mcGetAPI = new MetadataCacher(mcGetAPI.getUrl(), tokenGetAPI.getUserName(), tokenGetAPI.getToken()); + } + for (int j = 0; j < items.size(); j++) { + JsonObject item = items.get(j).getAsJsonObject(); + // check if repository is already saved + int repID = item.get("id").getAsInt(); + if (!processedRepID.contains(repID)) { + mcGetAPI = addLanguageToRepo(item, parser, mcGetAPI); + + this.addRepo(item); + processedRepID.add(repID); + IDtoWrite.add(repID); + } else { + System.out.println(repID + " already written"); + } + String pushed = item.get("pushed_at").getAsString(); + if (pushed.compareTo(time) < 0) { + time = pushed; } } + System.out.println(" "); } + int count = json.get("total_count").getAsInt(); // total_count can change between queries + if (count == items.size()) + break; } writeRemainingRepos(); - } - + } + private void addRepo(JsonObject repo) { File fileToWriteJson = null; this.repos.add(repo); if (this.repos.size() % RECORDS_PER_FILE == 0) { - fileToWriteJson = new File( - outDir + "/Thread-" + this.id + "-page-" + counter + ".json"); + fileToWriteJson = new File(outDir + "/Thread-" + this.id + "-page-" + counter + ".json"); while (fileToWriteJson.exists()) { - System.out.println(fileToWriteJson.getAbsolutePath() + " arleady exist"); + System.out.println(fileToWriteJson.getAbsolutePath() + " already exists"); counter++; @@ -128,6 +175,20 @@ private void addRepo(JsonObject repo) { FileIO.writeFileContents(fileToWriteJson, this.repos.toString()); System.out.println(Thread.currentThread().getId() + " " + counter++); this.repos = new JsonArray(); + try (FileWriter fw = new FileWriter(outDir + "/" + language + "processed.txt", true); + BufferedWriter bw = new BufferedWriter(fw); + PrintWriter out = new PrintWriter(bw)) { + for (Integer repID : IDtoWrite) { + out.println(repID.intValue()); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + IDtoWrite.clear(); + } + stop = System.currentTimeMillis(); + System.out.println("Time 
taken: " + (stop - start) / 1000.0 + "seconds"); + start = stop; } } @@ -140,8 +201,25 @@ public void writeRemainingRepos() { fileToWriteJson = new File(outDir + "/Thread-" + this.id + "-page-" + counter + ".json"); } FileIO.writeFileContents(fileToWriteJson, this.repos.toString()); - System.out.println(this.id + counter++); + System.out.println(this.id + counter++); } } + + // Returning metadataCacher so that we don't have to authenticate again to get remaining limit + private MetadataCacher addLanguageToRepo(JsonObject repo, Gson parser, MetadataCacher mc) { + String langurl = "https://api.github.com/repos/" + repo.get("full_name").getAsString() + "/languages"; + mc = new MetadataCacher(langurl, mc.getUsername(), mc.getPassword()); + if (mc.authenticate()) { + mc.getResponse(); + String pageContent = mc.getContent(); + JsonObject languages = parser.fromJson(pageContent, JsonElement.class).getAsJsonObject(); + repo.add("language_list", languages); + } else { + final int responsecode = mc.getResponseCode(); + System.err.println("authentication error " + responsecode); + } + return mc; + } + } -} +} \ No newline at end of file diff --git a/src/java/boa/datagen/forges/github/MetadataCacher.java b/src/java/boa/datagen/forges/github/MetadataCacher.java index 5966292bc..ef4be5ede 100644 --- a/src/java/boa/datagen/forges/github/MetadataCacher.java +++ b/src/java/boa/datagen/forges/github/MetadataCacher.java @@ -44,14 +44,26 @@ public void setUrl(String url) { e.printStackTrace(); } } + + public String getUserName() { + return this.username; + } public void setUsername(String username) { this.username = username; } + + public String getUsername() { + return username; + } public void setPassword(String password) { this.password = password; } + + public String getPassword() { + return password; + } public boolean isAuthenticated() { return authenticated; @@ -79,6 +91,7 @@ public boolean authenticate(String username, String password) { } catch (IOException e) { // considered as failed } +// System.out.println("authenticate: " + this.authenticated); return this.authenticated; } @@ -151,10 +164,14 @@ public int getNumberOfMaxLimit() { } public int getNumberOfRemainingLimit() { - return Integer.parseInt(this.connection.getHeaderField("X-RateLimit-Remaining")); + try { + return Integer.parseInt(this.connection.getHeaderField("X-RateLimit-Remaining")); + } catch(NumberFormatException e) { + return -1; + } } public long getLimitResetTime() { return Long.parseLong(this.connection.getHeaderField("X-RateLimit-Reset")); } -} +} \ No newline at end of file diff --git a/src/java/boa/datagen/forges/github/TokenList.java b/src/java/boa/datagen/forges/github/TokenList.java index dca3a7e1c..6907e7d95 100644 --- a/src/java/boa/datagen/forges/github/TokenList.java +++ b/src/java/boa/datagen/forges/github/TokenList.java @@ -33,17 +33,18 @@ public TokenList(String path) { } } + public Token getNextAuthenticToken(String url) { MetadataCacher mc = null; while (true) { for (Token token : tokens) { - // System.out.println("Trying token: " + token.getId()); mc = new MetadataCacher(url, token.getUserName(), token.getToken()); - if (mc.authenticate()) { + if (mc.authenticate() && mc.getNumberOfRemainingLimit() >= 1) { if (this.lastUsedToken != token.getId()) { this.lastUsedToken = token.getId(); - System.out.println("now using token: " + token.getId()); +// System.out.println("now using token: " + token.getId()); } +// System.out.println("Use authentic token: " + token.getId() + " user: " + token.getUserName()); return token; 
} } @@ -57,6 +58,105 @@ public Token getNextAuthenticToken(String url) { // throw new IllegalArgumentException(); } + + public MetadataCacher getNextAuthenticMetadataCacher(String url) { + MetadataCacher mc = null; + while (true) { + for (Token token : tokens) { + System.out.println("Trying token " + token.getId()); + mc = new MetadataCacher(url, token.getUserName(), token.getToken()); + if (mc.authenticate()) { + if (this.lastUsedToken != token.getId()) { + this.lastUsedToken = token.getId(); +// System.out.println("now using token: " + token.getId()); + } + System.out.println("Use authentic token: " + token.getId() + " user: " + token.getUserName()); + return mc; + } + // auth failed but quota remains: the URL itself is bad (e.g. a 404), so give up + if (mc.getNumberOfRemainingLimit() >= 1) + return null; + } + try { + System.out.println("waiting for token, going to sleep for 10s"); + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + // throw new IllegalArgumentException(); + } + + public Token getNextAuthenticTokenM(String url) { + MetadataCacher mc = null; + while (true) { + for (Token token : tokens) { + mc = new MetadataCacher(url, token.getUserName(), token.getToken()); + System.out.println("Trying token: " + token.getUserName() + " for " + url.substring(0, Math.min(url.length(), 40)) + ((url.length() < 40) ? "" : "...")); + mc.authenticate(); + System.out.println(mc.getUsername() + " " + mc.isAuthenticated()); + if (mc.getNumberOfRemainingLimit() >= 0) { + System.out.println(mc.getNumberOfRemainingLimit()); + if (this.lastUsedToken != token.getId()) { + this.lastUsedToken = token.getId(); + System.out.println("now using token: " + token.getUserName()); + } + return token; + } + } + try { +// long t = mc.getLimitResetTime() * 1000 - System.currentTimeMillis(); +// if (t >= 0) { // could be useful if json is created too fast +// System.out.println("Waiting " + (t / 1000) + " seconds for sending more requests."); +// try { +// Thread.sleep(t); +// } catch (InterruptedException e) { +// e.printStackTrace(); +// } +// } System.out.println("waiting for token, going to sleep for 10s"); + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } } + // throw new IllegalArgumentException(); + } + + + public Token getNextAuthenticToken(String url, int minRateLimit) { + MetadataCacher mc = null; + while (true) { + for (Token token : tokens) { + System.out.println("Trying token: " + token.getUserName() + " "); + mc = new MetadataCacher(url, token.getUserName(), token.getToken()); + mc.authenticate(); + if (mc.isAuthenticated()) { + int limitRemaining = mc.getNumberOfRemainingLimit(); + if (limitRemaining < minRateLimit) { + System.out.println("Authenticated but remaining limit " + limitRemaining + " is below the required minimum of " + minRateLimit); + continue; + } + if (this.lastUsedToken != token.getId()) { + this.lastUsedToken = token.getId(); + System.out.println("Now using token: " + token.getUserName() + " "); + } + System.out.println(mc.getNumberOfRemainingLimit()); + return token; + } else { + System.err.println("Can't authenticate, response code: " + mc.getResponseCode()); + } + } + try { + System.out.println("waiting for token, going to sleep for 10s"); + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } } + // throw new IllegalArgumentException(); + } public synchronized Token getAuthenticatedToken(long threadId) { while (true) { @@ -83,4 +183,4 @@ public synchronized void removeToken(Token tok) { public synchronized void addToken(Token tok) { 
this.tokens.add(tok); } -} +} \ No newline at end of file diff --git a/src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java b/src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java similarity index 65% rename from src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java rename to src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java index b0e435b4b..2c59b6452 100644 --- a/src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java +++ b/src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java @@ -1,4 +1,4 @@ -package boa.datagen.forges.github; +package boa.datagen.slurm; import java.io.File; import org.eclipse.jgit.api.Git; @@ -8,6 +8,7 @@ import boa.datagen.util.FileIO; +// Datagen Phase 1: download all bare repositories public class GitHubRepoBareDownloader { private static String INPUT_PATH; // The directory contains a list of repo json files @@ -19,62 +20,63 @@ public static void main(String[] args) { if (args.length < 3) { System.out.println("args: INPUT_NAMES_PATH, OUTPUT_REPOS_PATH, THREAD_NUM"); - } else { - INPUT_PATH = args[0]; - OUTPUT_REPOS_PATH = args[1]; - THREAD_NUM = Integer.parseInt(args[2]); - - File input = new File(INPUT_PATH); - - DownloadWorker[] workers = new DownloadWorker[THREAD_NUM]; - Thread[] threads = new Thread[THREAD_NUM]; - for (int i = 0; i < THREAD_NUM; i++) { - workers[i] = new DownloadWorker(i); - threads[i] = new Thread(workers[i]); - threads[i].start(); - } + return; + } - // assign tasks to workers - for (File file : input.listFiles()) { - if (!file.getName().endsWith(".json")) - continue; - JsonElement jsonTree = new JsonParser().parse(FileIO.readFileContents(file)); - for (JsonElement je : jsonTree.getAsJsonArray()) { - String projectName = je.getAsJsonObject().get("html_url").getAsString() - .replace("https://github.com/", ""); - - boolean assigned = false; - while (!assigned) { - for (int j = 0; j < THREAD_NUM; j++) { - if (workers[j].isReady()) { - workers[j].setName(projectName); - workers[j].setReady(false); - assigned = true; - break; - } - } - try { - Thread.sleep(100); - } catch (InterruptedException e) { - e.printStackTrace(); + INPUT_PATH = args[0]; + OUTPUT_REPOS_PATH = args[1]; + THREAD_NUM = Integer.parseInt(args[2]); + + File input = new File(INPUT_PATH); + + DownloadWorker[] workers = new DownloadWorker[THREAD_NUM]; + Thread[] threads = new Thread[THREAD_NUM]; + for (int i = 0; i < THREAD_NUM; i++) { + workers[i] = new DownloadWorker(i); + threads[i] = new Thread(workers[i]); + threads[i].start(); + } + + // assign tasks to workers + for (File file : input.listFiles()) { + if (!file.getName().endsWith(".json")) + continue; + JsonElement jsonTree = new JsonParser().parse(FileIO.readFileContents(file)); + for (JsonElement je : jsonTree.getAsJsonArray()) { + String projectName = je.getAsJsonObject().get("html_url").getAsString().replace("https://github.com/", + ""); + + boolean assigned = false; + while (!assigned) { + for (int j = 0; j < THREAD_NUM; j++) { + if (workers[j].isReady()) { + workers[j].setName(projectName); + workers[j].setReady(false); + assigned = true; + break; } } - } - } - - // wait for all done - for (int j = 0; j < THREAD_NUM; j++) { - while (!workers[j].isReady()) try { Thread.sleep(100); } catch (InterruptedException e) { e.printStackTrace(); } + } } + } - setDone(true); + // wait for all done + for (int j = 0; j < THREAD_NUM; j++) { + while (!workers[j].isReady()) + try { + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } } + setDone(true); + } 
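+	// Note on the hand-off above: the main thread busy-waits until some worker reports isReady(), assigns it one "user/repo" name via setName(), and marks it busy with setReady(false); each DownloadWorker then makes a bare clone of its assigned repository and flips itself back to ready when done.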
synchronized static boolean getDone() { @@ -138,7 +140,7 @@ private void runJob() { result.getRepository().close(); } } else { - System.out.println("repo " + projectName + "already exists"); + System.out.println("repo " + projectName + " already exists"); } } diff --git a/src/java/boa/datagen/slurm/SeqRepoBuilder.java b/src/java/boa/datagen/slurm/SeqRepoBuilder.java new file mode 100644 index 000000000..9c182ca73 --- /dev/null +++ b/src/java/boa/datagen/slurm/SeqRepoBuilder.java @@ -0,0 +1,304 @@ +package boa.datagen.slurm; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +import boa.datagen.DefaultProperties; +import boa.datagen.forges.github.RepoMetadata; +import boa.datagen.scm.AbstractConnector; +import boa.datagen.scm.GitConnector; +import boa.datagen.util.FileIO; +import boa.types.Code.CodeRepository; +import boa.types.Code.Revision; +import boa.types.Toplevel.Project; + +// Datagen Phase 2: the DATAGEN_JAR executed by each slurm job +public class SeqRepoBuilder { + + private static String REPO_PATH; + private static String JSON_FILES_PATH; + private static String OUTPUT_PATH; + + private static Configuration conf = null; + private static FileSystem fileSystem = null; + + private static String suffix; + private static SequenceFile.Writer projectWriter, astWriter, commitWriter, contentWriter; + private static long astWriterLen = 1, commitWriterLen = 1, contentWriterLen = 1; + + private static int MAX_COMMITS = Integer.valueOf(DefaultProperties.MAX_SIZE_FOR_PROJECT_WITH_COMMITS); + + public static void main(String[] args) throws IOException { + + if (args.length < 3) { + System.err.println("Need args:\n" + "REPO_PATH\n" + "JSON_FILES_PATH\n" + "OUTPUT_PATH\n"); + return; + } + + REPO_PATH = args[0]; + JSON_FILES_PATH = args[1]; + OUTPUT_PATH = args[2]; + + conf = new Configuration(); + fileSystem = FileSystem.get(conf); + boa.datagen.DefaultProperties.DEBUG = true; + + int counter = 0; + for (String jsonFilePath : getJsonFilePaths()) { + File file = new File(jsonFilePath); + String content = FileIO.readFileContents(file); + Gson parser = new Gson(); + JsonArray repoArray = null; + try { + repoArray = parser.fromJson(content, JsonElement.class).getAsJsonArray(); + } catch (Exception e) { + System.err.println("Error processing page: " + file.getPath()); + e.printStackTrace(); + continue; + } + // iterate each json object (project metadata) in the json array + for (int i = 0; i < repoArray.size(); i++) { + JsonObject rp = repoArray.get(i).getAsJsonObject(); + RepoMetadata repo = new RepoMetadata(rp); + if (repo.id != null && repo.name != null) { + System.out.println("Processing project " + (++counter) + ": " + repo.name); + // generate seq files for this project + Project project = repo.toBoaMetaDataProtobuf(); + process(project); + } + } + } + + // done + } + + private static void process(Project project) { + String projectName = project.getName(); + String[] writerPaths = 
openWriters(projectName); + + // if writerPaths is null, the project has already been processed. + if (writerPaths == null) { + System.out.println(projectName + " seq file already exists"); + return; + } + + try { + project = storeRepository(project, 0); + // skip the project if storeRepository returned null + if (project == null) { + System.out.println(projectName + " is null, skipping it"); + clear(writerPaths); + return; + } + + // store project into sequence file + BytesWritable bw = new BytesWritable(project.toByteArray()); + if (bw.getLength() <= MAX_COMMITS || (project.getCodeRepositoriesCount() > 0 + && project.getCodeRepositories(0).getRevisionKeysCount() > 0)) { + // Approach 1: if the Project size is acceptable, then directly append the + // Project instance into the sequence file + projectWriter.append(new Text(project.getId()), bw); + } else { + // Approach 2: if the size is too large, extract Commit instances and append + // them into commit sequence file. + Project.Builder pb = Project.newBuilder(project); + for (CodeRepository.Builder cb : pb.getCodeRepositoriesBuilderList()) { + for (Revision.Builder rb : cb.getRevisionsBuilderList()) { + cb.addRevisionKeys(commitWriterLen); + bw = new BytesWritable(rb.build().toByteArray()); + commitWriter.append(new LongWritable(commitWriterLen), bw); + commitWriterLen += bw.getLength(); + } + cb.clearRevisions(); + } + projectWriter.append(new Text(pb.getId()), new BytesWritable(pb.build().toByteArray())); + } + } catch (Throwable e) { + e.printStackTrace(); + clear(writerPaths); + return; + } + + System.out.println(projectName + " finished"); + closeWriters(); + } + + private static void clear(String[] writerPaths) { + closeWriters(); + // remove sequence files + for (String path : writerPaths) { + File file = new File(path); + if (file.exists()) + org.apache.commons.io.FileUtils.deleteQuietly(file); + } + } + + private static Project storeRepository(final Project project, final int i) { + final CodeRepository repo = project.getCodeRepositories(i); // this is an empty code repo + final Project.Builder projBuilder = Project.newBuilder(project); + + final String name = project.getName(); + File gitDir = new File(REPO_PATH + "/" + name); + + // return null to skip empty project + if (isFiltered(project)) { + System.err.println(name + " is filtered"); + return null; + } + + AbstractConnector conn = null; + try { + conn = new GitConnector(gitDir.getAbsolutePath(), project.getName(), astWriter, astWriterLen, commitWriter, + commitWriterLen, contentWriter, contentWriterLen); + final CodeRepository.Builder repoBuilder = CodeRepository.newBuilder(repo); + + List<Object> revisions = conn.getRevisions(project.getName()); + if (!revisions.isEmpty()) { + if (revisions.get(0) instanceof Revision) { + // Approach 1: if the revision object is Revision, add it into the repoBuilder + for (final Object rev : revisions) { + final Revision.Builder revBuilder = Revision.newBuilder((Revision) rev); + repoBuilder.addRevisions(revBuilder); + } + } else { + // Approach 2: else save it as a key pointing to the Revision instance in the + // commit sequence file + for (final Object rev : revisions) + repoBuilder.addRevisionKeys((Long) rev); + } + } + + // head commit indicates the latest commit which may not be in the default + // branch + repoBuilder.setHead(conn.getHeadCommitOffset()); + repoBuilder.addAllHeadSnapshot(conn.buildHeadSnapshot()); + repoBuilder.addAllBranches(conn.getBranchIndices()); + repoBuilder.addAllBranchNames(conn.getBranchNames()); + 
repoBuilder.addAllTags(conn.getTagIndices()); + repoBuilder.addAllTagNames(conn.getTagNames()); + projBuilder.setCodeRepositories(i, repoBuilder); + + // return the completely built project + return projBuilder.build(); + + } catch (final Throwable e) { + System.err.println("unknown error " + project.getName()); + e.printStackTrace(); + } finally { + if (conn != null) { + astWriterLen = conn.getAstWriterLen(); + commitWriterLen = conn.getCommitWriterLen(); + contentWriterLen = conn.getContentWriterLen(); + try { + conn.close(); + } catch (Exception e) { + System.err.println("Cannot close Git connector to " + gitDir.getAbsolutePath()); + e.printStackTrace(); + } + } + } + + // return null to skip error project + return null; + } + + private synchronized static boolean isFiltered(Project project) { + if (project.getForked()) + return true; +// if (project.getStars() < 2 && project.getSize() < 100) +// return true; + if (project.getProgrammingLanguagesList().contains("Java") + || project.getProgrammingLanguagesList().contains("JavaScript") + || project.getProgrammingLanguagesList().contains("PHP")) + return false; + String lang = project.getMainLanguage(); + if (lang != null && (lang.equals("Java") || lang.equals("JavaScript") || lang.equals("PHP"))) + return false; + return true; + } + + public static String[] openWriters(String projectName) { + suffix = projectName + ".seq"; + while (true) { + try { + String projectWriterPath = OUTPUT_PATH + "/project/" + suffix; + + // if the project is already processed return null + if (new File(projectWriterPath).exists()) + return null; + + projectWriter = SequenceFile.createWriter(fileSystem, conf, new Path(projectWriterPath), Text.class, + BytesWritable.class, CompressionType.BLOCK); + + String astWriterPath = OUTPUT_PATH + "/ast/" + suffix; + astWriter = SequenceFile.createWriter(fileSystem, conf, new Path(astWriterPath), LongWritable.class, + BytesWritable.class, CompressionType.BLOCK); + + String commitWriterPath = OUTPUT_PATH + "/commit/" + suffix; + commitWriter = SequenceFile.createWriter(fileSystem, conf, new Path(commitWriterPath), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + + String contentWriterPath = OUTPUT_PATH + "/source/" + suffix; + contentWriter = SequenceFile.createWriter(fileSystem, conf, new Path(contentWriterPath), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + + astWriterLen = 1; + commitWriterLen = 1; + contentWriterLen = 1; + + return new String[] { projectWriterPath, astWriterPath, commitWriterPath, contentWriterPath }; + } catch (Throwable t) { + t.printStackTrace(); + } } } + + public static void closeWriters() { + while (true) { + try { + projectWriter.close(); + astWriter.close(); + commitWriter.close(); + contentWriter.close(); + return; + } catch (Throwable t) { + t.printStackTrace(); + } } } + + private static List<String> getJsonFilePaths() { + List<String> jsonFilePaths = new ArrayList<String>(); + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(JSON_FILES_PATH)); + String line = reader.readLine(); + while (line != null) { + jsonFilePaths.add(line); + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + return jsonFilePaths; + } } diff --git a/src/java/boa/datagen/slurm/SeqRepoCombiner.java b/src/java/boa/datagen/slurm/SeqRepoCombiner.java new file mode 100644 index 000000000..67a9ed9ae --- /dev/null +++ b/src/java/boa/datagen/slurm/SeqRepoCombiner.java @@ -0,0 +1,236 @@ +package boa.datagen.slurm; + 
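+// Key-rebasing sketch (inferred from the code below): each per-project seq file from Phase 2 numbers its AST and commit entries independently starting at 1, so while merging, every ChangedFile key and revision key is shifted by the running totals (lastAstWriterKey/lastCommitWriterKey), and ChangedFile.ast_key records which ast/map<N> shard received the project's ASTs.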
+import java.io.EOFException; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.DefaultCodec; + +import com.google.protobuf.CodedInputStream; + +import boa.types.Code.CodeRepository; +import boa.types.Code.Revision; +import boa.types.Diff.ChangedFile; +import boa.types.Toplevel.Project; + +// Datagen Phase 3: combine sequence files +public class SeqRepoCombiner { + + private static String DATASET_PATH; // generated dataset in phase 2 + private static int PROJECT_NUM_IN_AST; // maximum number of projects in each ast map + + private static FileSystem fs; + private static Configuration conf; + private static SequenceFile.Writer projectWriter; + private static MapFile.Writer astWriter; + private static MapFile.Writer commitWriter; + + public static void main(String[] args) { + + if (args.length < 2) { + System.err.println("Need args:\n" + "DATASET_PATH\n" + "PROJECT_NUM_IN_AST\n"); + return; + } + + DATASET_PATH = args[0]; + PROJECT_NUM_IN_AST = Integer.parseInt(args[1]); + + try { + conf = new Configuration(); + fs = FileSystem.get(conf); + + // remove any previously combined seq files + checkAndRemove(DATASET_PATH + "/combined"); + + int astMapSuffix = 0, fileCount = 0, projectCount = 0; + openWriters(astMapSuffix); + + long lastAstWriterKey = 0, lastCommitWriterKey = 0; + HashSet<String> processedProjectNames = new HashSet<String>(); + // iterate each directory + for (FileStatus file : fs.listStatus(new Path(DATASET_PATH + "/project"))) { + if (!file.isDir()) + continue; + // iterate each seq file + for (FileStatus seqFile : fs.listStatus(file.getPath())) { + if (!seqFile.getPath().getName().endsWith(".seq")) + continue; + fileCount++; + String name = seqFile.getPath().getName(); + + SequenceFile.Reader r = null; + try { + System.out.println("Reading file " + fileCount + " : " + name); + r = new SequenceFile.Reader(fs, seqFile.getPath(), conf); + Text textKey = new Text(); + BytesWritable value = new BytesWritable(); + + String projectName = null; + // each seq file should contain only one project + while (r.next(textKey, value)) { + Project p = Project + .parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength())); + if (processedProjectNames.contains(p.getName())) + continue; + projectName = p.getName(); + Project.Builder pb = Project.newBuilder(p); + for (CodeRepository.Builder crb : pb.getCodeRepositoriesBuilderList()) { + if (crb.getRevisionsCount() > 0) { + for (Revision.Builder rb : crb.getRevisionsBuilderList()) { + for (ChangedFile.Builder cfb : rb.getFilesBuilderList()) { + long key = cfb.getKey(); + if (key > 0) + cfb.setKey(lastAstWriterKey + key); + cfb.setAstKey(astMapSuffix); + } + } + } else { + for (int j = 0; j < crb.getRevisionKeysCount(); j++) { + crb.setRevisionKeys(j, lastCommitWriterKey + crb.getRevisionKeys(j)); + } + } + for (ChangedFile.Builder cfb : crb.getHeadSnapshotBuilderList()) { + long key = cfb.getKey(); + if (key > 0) + cfb.setKey(lastAstWriterKey + key); + cfb.setAstKey(astMapSuffix); + } + } + projectWriter.append(textKey, new 
BytesWritable(pb.build().toByteArray())); + processedProjectNames.add(projectName); + + // rebase keys and append the project's corresponding commit and ast seq files + lastCommitWriterKey = readAndAppendCommit(conf, fs, commitWriter, + DATASET_PATH + "/commit/" + projectName + ".seq", lastAstWriterKey, + lastCommitWriterKey, astMapSuffix); + lastAstWriterKey = readAndAppendAst(conf, fs, astWriter, + DATASET_PATH + "/ast/" + projectName + ".seq", lastAstWriterKey); + + System.out.println("Finished project " + projectName); + + projectCount++; + // open a new ast writer if current writer hits the maximum project number + if (projectCount >= PROJECT_NUM_IN_AST) { + astMapSuffix++; + astWriter.close(); + astWriter = new MapFile.Writer(conf, fs, DATASET_PATH + "/combined/ast/map" + astMapSuffix, + LongWritable.class, BytesWritable.class, CompressionType.BLOCK, + new DefaultCodec(), null); + projectCount = 0; + } + } + } catch (EOFException e) { + e.printStackTrace(); + System.err.println("ignoring project " + name); + continue; + } catch (Exception e) { + e.printStackTrace(); + System.err.println("ignoring project " + name); + continue; + } finally { + if (r != null) + r.close(); + } + } + } + + closeWriters(); + fs.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void openWriters(int astCount) { + CompressionType compType = CompressionType.BLOCK; + CompressionCodec compCode = new DefaultCodec(); + try { + projectWriter = SequenceFile.createWriter(fs, conf, new Path(DATASET_PATH + "/combined/projects.seq"), + Text.class, BytesWritable.class, compType, compCode); + astWriter = new MapFile.Writer(conf, fs, DATASET_PATH + "/combined/ast/map" + astCount, LongWritable.class, + BytesWritable.class, compType, compCode, null); + commitWriter = new MapFile.Writer(conf, fs, DATASET_PATH + "/combined/commit", LongWritable.class, + BytesWritable.class, compType, compCode, null); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public static void closeWriters() { + try { + projectWriter.close(); + astWriter.close(); + commitWriter.close(); + } catch (Throwable t) { + t.printStackTrace(); + } + } + + private static void checkAndRemove(String path) { + File file = new File(path); + if (file.exists()) { + System.out.println("remove file " + path); + org.apache.commons.io.FileUtils.deleteQuietly(file); + } + } + + public static long readAndAppendCommit(Configuration conf, FileSystem fileSystem, MapFile.Writer writer, + String fileName, long lastAstKey, long lastCommitKey, int astMapSuffix) throws IOException { + long newLastKey = lastCommitKey; + SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, new Path(fileName), conf); + LongWritable longKey = new LongWritable(); + BytesWritable value = new BytesWritable(); + try { + while (r.next(longKey, value)) { + newLastKey = longKey.get() + lastCommitKey; + Revision rev = Revision.parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength())); + Revision.Builder rb = Revision.newBuilder(rev); + for (ChangedFile.Builder cfb : rb.getFilesBuilderList()) { + long key = cfb.getKey(); + if (key > 0) + cfb.setKey(lastAstKey + key); + cfb.setAstKey(astMapSuffix); + } + writer.append(new LongWritable(newLastKey), new BytesWritable(rb.build().toByteArray())); + } + } catch (Exception e) { + System.err.println(fileName); + e.printStackTrace(); + } finally { + r.close(); + } + return newLastKey; + } + + public static long readAndAppendAst(Configuration conf, FileSystem fileSystem, MapFile.Writer writer, + String fileName, long lastKey) throws 
IOException { + long newLastKey = lastKey; + SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, new Path(fileName), conf); + LongWritable longKey = new LongWritable(); + BytesWritable value = new BytesWritable(); + try { + while (r.next(longKey, value)) { + newLastKey = longKey.get() + lastKey; + writer.append(new LongWritable(newLastKey), value); + } + } catch (Exception e) { + System.err.println(fileName); + e.printStackTrace(); + } finally { + r.close(); + } + return newLastKey; + } +} diff --git a/src/java/boa/datagen/slurm/SeqRepoGenerator.java b/src/java/boa/datagen/slurm/SeqRepoGenerator.java new file mode 100644 index 000000000..34486defd --- /dev/null +++ b/src/java/boa/datagen/slurm/SeqRepoGenerator.java @@ -0,0 +1,112 @@ +package boa.datagen.slurm; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +// Datagen Phase 2: submit slurm jobs that generate seq files for the bare repositories +public class SeqRepoGenerator { + + private static String SLURM_JOB_TEMPLATE_PATH; + private static String JSON_INPUT_PATH; // The directory containing the repo json files + private static String REPO_INPUT_PATH; // The directory containing the bare repositories + private static String SPLIT_JSON_PATH; + private static String DATAGEN_JAR_PATH; + private static String OUTPUT_PATH; + private static int FILE_NUM_PER_JOB; + + public static void main(String[] args) { + + if (args.length < 7) { + System.err.println("Need args:\n" + "SLURM_JOB_TEMPLATE_PATH\n" + "JSON_INPUT_PATH\n" + "REPO_INPUT_PATH\n" + + "SPLIT_JSON_PATH\n" + "DATAGEN_JAR_PATH\n" + "OUTPUT_PATH\n" + "FILE_NUM_PER_JOB"); + return; + } + + SLURM_JOB_TEMPLATE_PATH = args[0]; + JSON_INPUT_PATH = args[1]; + REPO_INPUT_PATH = args[2]; + SPLIT_JSON_PATH = args[3]; + DATAGEN_JAR_PATH = args[4]; + OUTPUT_PATH = args[5]; + FILE_NUM_PER_JOB = Integer.parseInt(args[6]); + + // check split directory + File splitDir = new File(SPLIT_JSON_PATH); + if (splitDir.exists()) { + System.out.println("deleting " + splitDir.getAbsolutePath()); + org.apache.commons.io.FileUtils.deleteQuietly(splitDir); + } + if (!splitDir.mkdir()) + System.err.println("can't make directory " + splitDir.getAbsolutePath()); + + // split json files + File input = new File(JSON_INPUT_PATH); + List<String> files = new ArrayList<String>(); + int count = 0; + for (File file : input.listFiles()) { + if (file.getName().endsWith(".json")) { + files.add(file.getAbsolutePath()); + if (files.size() == FILE_NUM_PER_JOB) { + write(files, count++); + files = new ArrayList<String>(); + } + } + } + if (files.size() != 0) + write(files, count); + + // run slurm job + input = new File(SPLIT_JSON_PATH); + for (File file : input.listFiles()) + runSlurmJob(file.getAbsolutePath()); + } + + private static void write(List<String> files, int count) { + StringBuilder sb = new StringBuilder(); + for (String s : files) + sb.append(s + "\n"); + BufferedWriter writer; + String path = SPLIT_JSON_PATH + "/" + count + ".txt"; + try { + writer = new BufferedWriter(new FileWriter(path)); + writer.write(sb.toString()); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private static void runSlurmJob(String splitPath) { + Process p; + try { + List<String> cmdList = new ArrayList<String>(); + cmdList.add("sbatch"); +// cmdList.add("sh"); + cmdList.add(SLURM_JOB_TEMPLATE_PATH); + // args + cmdList.add(DATAGEN_JAR_PATH); // 1st arg: datagen jar path + 
cmdList.add(REPO_INPUT_PATH); // 2nd arg: bare repo path + cmdList.add(splitPath); // 3rd arg: the file containing the paths of the split json files + cmdList.add(OUTPUT_PATH); // 4th arg: output path + // (these four positional args correspond to ${1}..${4} in slurmJob.sh) + ProcessBuilder pb = new ProcessBuilder(cmdList); + p = pb.start(); + + p.waitFor(); // fine here because sbatch prints very little; read the stream before waitFor() if output could be large + BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); + String line; + while ((line = reader.readLine()) != null) + System.out.println(line); + + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } +} diff --git a/src/java/boa/datagen/slurm/run-combiner.sh b/src/java/boa/datagen/slurm/run-combiner.sh new file mode 100644 index 000000000..e4bd1aed0 --- /dev/null +++ b/src/java/boa/datagen/slurm/run-combiner.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=24 +#SBATCH --mem=101G +#SBATCH --time=30-02:30:02 + +# ----- optional +#SBATCH --output=job.%J.out +#SBATCH --error=job.%J.err +#SBATCH --job-name="combine" +#SBATCH --partition=speedy + +# ----- load module +module load jdk + +JARFILE="./seq-repo-combiner.jar" +RAM="-Xmx100G" # need to change accordingly + +# local test
# OUTPUT_PATH="/Users/hyj/git/BoaData/DataSet/p3test" +# PROJECT_NUM_IN_AST="1" + +# remote +OUTPUT_PATH="/work/LAS/hridesh-lab/yijia/p3datagen/dataset_new" +PROJECT_NUM_IN_AST="10000" + +# main +CMD="java ${RAM} -Xss64M -jar \ +${JARFILE} \ +${OUTPUT_PATH} \ +${PROJECT_NUM_IN_AST}" + +echo "Execute: ${CMD}" +${CMD} \ No newline at end of file diff --git a/src/java/boa/datagen/slurm/run-generator.sh b/src/java/boa/datagen/slurm/run-generator.sh new file mode 100644 index 000000000..980bdffb9 --- /dev/null +++ b/src/java/boa/datagen/slurm/run-generator.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=2G +#SBATCH --time=30-02:30:02 + +# ----- optional +#SBATCH --output=job.%J.out +#SBATCH --error=job.%J.err +#SBATCH --job-name="run-gen" + +# ----- load module +module load jdk + +JARFILE="./seq-repo-generator.jar" +RAM="-Xmx1G" # need to change accordingly + +# local test +# SLURM_JOB_TEMPLATE_PATH="slurmJob.sh" +# JSON_INPUT_PATH="/Users/hyj/git/BoaData/DataGenInputJson" +# REPO_INPUT_PATH="/Users/hyj/git/BoaData/DataGenInputRepo" +# SPLIT_JSON_PATH="split" +# DATAGEN_JAR_PATH="seq-repo-builder.jar" +# OUTPUT_PATH="/Users/hyj/git/BoaData/DataSet/p3test" +# FILE_NUM_PER_JOB="1" + +# remote +SLURM_JOB_TEMPLATE_PATH="slurmJob.sh" +JSON_INPUT_PATH="/work/LAS/hridesh-lab/longvu/2020_java_dataset/2020_java_json_sized" +REPO_INPUT_PATH="/work/LAS/hridesh-lab/longvu/2020_java_dataset/input_repo_java" +SPLIT_JSON_PATH="split" +DATAGEN_JAR_PATH="seq-repo-builder.jar" +OUTPUT_PATH="/work/LAS/hridesh-lab/yijia/p3datagen/dataset_new" +FILE_NUM_PER_JOB="10" + +# main +CMD="java ${RAM} -Xss64M -jar \ +${JARFILE} \ +${SLURM_JOB_TEMPLATE_PATH} \ +${JSON_INPUT_PATH} \ +${REPO_INPUT_PATH} \ +${SPLIT_JSON_PATH} \ +${DATAGEN_JAR_PATH} \ +${OUTPUT_PATH} \ +${FILE_NUM_PER_JOB}" + +echo "Execute: ${CMD}" +${CMD} \ No newline at end of file diff --git a/src/java/boa/datagen/slurm/slurmJob.sh b/src/java/boa/datagen/slurm/slurmJob.sh new file mode 100644 index 000000000..21e3f61fa --- /dev/null +++ b/src/java/boa/datagen/slurm/slurmJob.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=25G +#SBATCH --time=30-02:30:02 + +# ----- optional +#SBATCH --output=job.%J.out +#SBATCH --error=job.%J.err +#SBATCH --job-name="datagen" + +# ----- load module +module load 
jdk + +# ----- main +# $1: datagen jar path +# $2: bare repo path +# $3: file listing the split json paths +# $4: output path +JARFILE="${1}" +REPO="${2}" +JSON_FILES="${3}" +OUTPUT="${4}" +RAM="-Xmx24G" # need to change accordingly + +# ----- run +CMD="java ${RAM} -Xss64M -jar \ +${JARFILE} \ +${REPO} \ +${JSON_FILES} \ +${OUTPUT}" + +echo "Execute: ${CMD}" +${CMD} \ No newline at end of file diff --git a/src/java/boa/functions/BoaAstIntrinsics.java b/src/java/boa/functions/BoaAstIntrinsics.java index c55d673e9..b905d6103 100644 --- a/src/java/boa/functions/BoaAstIntrinsics.java +++ b/src/java/boa/functions/BoaAstIntrinsics.java @@ -69,6 +69,7 @@ public class BoaAstIntrinsics { @SuppressWarnings("rawtypes") static Context context; + private static int curMapSuffix = -1; // only used if the dataset contains multiple ast maps private static MapFile.Reader map, commentsMap, issuesMap; private static final Revision emptyRevision; @@ -126,8 +127,13 @@ public static ASTRoot getast(final ChangedFile f) { context.getCounter(ASTCOUNTER.GETS_ATTEMPTED).increment(1); - if (map == null) - openMap(); + // no map is open yet, OR the open map's suffix doesn't match the changed file's ast key + if (map == null || (curMapSuffix != -1 && curMapSuffix != f.getAstKey())) { + if (!f.hasAstKey()) + openMap(); + else + openMap(f.getAstKey()); + } try { final BytesWritable value = new BytesWritable(); @@ -302,6 +308,31 @@ private static void openMap() { } } + private static void openMap(int mapSuffix) { + try { + final Configuration conf = context.getConfiguration(); + final FileSystem fs; + final Path p; + if (DefaultProperties.localDataPath != null) { + p = new Path(DefaultProperties.localDataPath, "ast/map" + mapSuffix); + fs = FileSystem.getLocal(conf); + } else { + p = new Path( + context.getConfiguration().get("fs.default.name", "hdfs://boa-njt/"), + new Path( + conf.get("boa.ast.dir", conf.get("boa.input.dir", "repcache/live")), + new Path("ast/map" + mapSuffix) + ) + ); + fs = FileSystem.get(conf); + } + map = new MapFile.Reader(fs, p.toString(), conf); + curMapSuffix = mapSuffix; + } catch (final Exception e) { + e.printStackTrace(); + } + } + private static void openCommentMap() { try { final Configuration conf = context.getConfiguration(); diff --git a/src/java/boa/runtime/BoaPartitioner.java b/src/java/boa/runtime/BoaPartitioner.java index 695a689ab..8c250c440 100644 --- a/src/java/boa/runtime/BoaPartitioner.java +++ b/src/java/boa/runtime/BoaPartitioner.java @@ -32,7 +32,7 @@ public class BoaPartitioner extends Partitioner<EmitKey, EmitValue> { private static String[] outputVariableNames = new String[0]; public int getPartition(final EmitKey key, final EmitValue value, final int num) { - return getPartitionForVariable(key.getName()); + return getPartitionForVariable(key.getName()) % num; } public static void setVariableNames(final String[] names) { diff --git a/src/proto/diff.proto b/src/proto/diff.proto index a1b3f7660..0f04b5e09 100644 --- a/src/proto/diff.proto +++ b/src/proto/diff.proto @@ -147,4 +147,6 @@ message ChangedFile { repeated int32 previous_versions = 9; /** @exclude The indices of the previous files in the list of changed files of the corresponding parent commits */ repeated int32 previous_indices = 10; + + /** @exclude The index of the ast map (ast/map<N>) that contains this file's AST */ + optional int32 ast_key = 11; }