diff --git a/src/compiled-proto/boa/types/Diff.java b/src/compiled-proto/boa/types/Diff.java index 2967dd8b1..11e1d9cd7 100644 --- a/src/compiled-proto/boa/types/Diff.java +++ b/src/compiled-proto/boa/types/Diff.java @@ -249,6 +249,16 @@ public interface ChangedFileOrBuilder * */ int getPreviousIndices(int index); + + // optional int32 ast_key = 11; + /** + * optional int32 ast_key = 11; + */ + boolean hasAstKey(); + /** + * optional int32 ast_key = 11; + */ + int getAstKey(); } /** * Protobuf type {@code boa.types.ChangedFile} @@ -438,6 +448,11 @@ private ChangedFile( input.popLimit(limit); break; } + case 88: { + bitField0_ |= 0x00000040; + astKey_ = input.readInt32(); + break; + } } } } catch (com.google.protobuf.InvalidProtocolBufferException e) { @@ -1780,6 +1795,22 @@ public int getPreviousIndices(int index) { return previousIndices_.get(index); } + // optional int32 ast_key = 11; + public static final int AST_KEY_FIELD_NUMBER = 11; + private int astKey_; + /** + * optional int32 ast_key = 11; + */ + public boolean hasAstKey() { + return ((bitField0_ & 0x00000040) == 0x00000040); + } + /** + * optional int32 ast_key = 11; + */ + public int getAstKey() { + return astKey_; + } + private void initFields() { change_ = boa.types.Shared.ChangeKind.UNKNOWN; kind_ = boa.types.Diff.ChangedFile.FileKind.OTHER; @@ -1791,6 +1822,7 @@ private void initFields() { previousNames_ = com.google.protobuf.LazyStringArrayList.EMPTY; previousVersions_ = java.util.Collections.emptyList(); previousIndices_ = java.util.Collections.emptyList(); + astKey_ = 0; } private byte memoizedIsInitialized = -1; public final boolean isInitialized() { @@ -1860,6 +1892,9 @@ public void writeTo(com.google.protobuf.CodedOutputStream output) for (int i = 0; i < previousIndices_.size(); i++) { output.writeInt32(10, previousIndices_.get(i)); } + if (((bitField0_ & 0x00000040) == 0x00000040)) { + output.writeInt32(11, astKey_); + } getUnknownFields().writeTo(output); } @@ -1929,6 +1964,10 @@ public int getSerializedSize() { size += dataSize; size += 1 * getPreviousIndicesList().size(); } + if (((bitField0_ & 0x00000040) == 0x00000040)) { + size += com.google.protobuf.CodedOutputStream + .computeInt32Size(11, astKey_); + } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; return size; @@ -2074,6 +2113,8 @@ public Builder clear() { bitField0_ = (bitField0_ & ~0x00000100); previousIndices_ = java.util.Collections.emptyList(); bitField0_ = (bitField0_ & ~0x00000200); + astKey_ = 0; + bitField0_ = (bitField0_ & ~0x00000400); return this; } @@ -2151,6 +2192,10 @@ public boa.types.Diff.ChangedFile buildPartial() { bitField0_ = (bitField0_ & ~0x00000200); } result.previousIndices_ = previousIndices_; + if (((from_bitField0_ & 0x00000400) == 0x00000400)) { + to_bitField0_ |= 0x00000040; + } + result.astKey_ = astKey_; result.bitField0_ = to_bitField0_; onBuilt(); return result; @@ -2227,6 +2272,9 @@ public Builder mergeFrom(boa.types.Diff.ChangedFile other) { } onChanged(); } + if (other.hasAstKey()) { + setAstKey(other.getAstKey()); + } this.mergeUnknownFields(other.getUnknownFields()); return this; } @@ -3150,6 +3198,39 @@ public Builder clearPreviousIndices() { return this; } + // optional int32 ast_key = 11; + private int astKey_ ; + /** + * optional int32 ast_key = 11; + */ + public boolean hasAstKey() { + return ((bitField0_ & 0x00000400) == 0x00000400); + } + /** + * optional int32 ast_key = 11; + */ + public int getAstKey() { + return astKey_; + } + /** + * optional int32 ast_key = 11; + */ + public 
Builder setAstKey(int value) { + bitField0_ |= 0x00000400; + astKey_ = value; + onChanged(); + return this; + } + /** + * optional int32 ast_key = 11; + */ + public Builder clearAstKey() { + bitField0_ = (bitField0_ & ~0x00000400); + astKey_ = 0; + onChanged(); + return this; + } + // @@protoc_insertion_point(builder_scope:boa.types.ChangedFile) } @@ -3176,38 +3257,38 @@ public Builder clearPreviousIndices() { static { java.lang.String[] descriptorData = { "\n\ndiff.proto\022\tboa.types\032\014shared.proto\032\ta" + - "st.proto\"\325\t\n\013ChangedFile\022%\n\006change\030\001 \002(\016" + + "st.proto\"\346\t\n\013ChangedFile\022%\n\006change\030\001 \002(\016" + "2\025.boa.types.ChangeKind\022-\n\004kind\030\002 \002(\0162\037." + "boa.types.ChangedFile.FileKind\022\014\n\004name\030\003" + " \002(\t\022\013\n\003key\030\004 \002(\004\022\013\n\003ast\030\005 \002(\010\022)\n\010commen" + "ts\030\006 \001(\0132\027.boa.types.CommentsRoot\022&\n\007cha" + "nges\030\007 \003(\0162\025.boa.types.ChangeKind\022\026\n\016pre" + "vious_names\030\010 \003(\t\022\031\n\021previous_versions\030\t" + - " \003(\005\022\030\n\020previous_indices\030\n \003(\005\"\247\007\n\010FileK" + - "ind\022\t\n\005OTHER\020\000\022\n\n\006BINARY\020\001\022\010\n\004TEXT\020\002\022\007\n\003", - "XML\020\003\022\025\n\021SOURCE_JAVA_ERROR\020d\022\024\n\020SOURCE_J" + - "AVA_JLS2\020f\022\024\n\020SOURCE_JAVA_JLS3\020g\022\024\n\020SOUR" + - "CE_JAVA_JLS4\020h\022\024\n\020SOURCE_JAVA_JLS8\020l\022\016\n\n" + - "JAVA_ERROR\020d\022\010\n\004JLS2\020f\022\010\n\004JLS3\020g\022\010\n\004JLS4" + - "\020h\022\010\n\004JLS8\020l\022\024\n\017SOURCE_CS_ERROR\020\310\001\022\022\n\rSO" + - "URCE_CS_CS1\020\311\001\022\022\n\rSOURCE_CS_CS2\020\312\001\022\022\n\rSO" + - "URCE_CS_CS3\020\313\001\022\022\n\rSOURCE_CS_CS4\020\314\001\022\022\n\rSO" + - "URCE_CS_CS5\020\315\001\022\r\n\010CS_ERROR\020\310\001\022\010\n\003CS1\020\311\001\022" + - "\010\n\003CS2\020\312\001\022\010\n\003CS3\020\313\001\022\010\n\003CS4\020\314\001\022\010\n\003CS5\020\315\001\022" + - "\024\n\017SOURCE_JS_ERROR\020\254\002\022\022\n\rSOURCE_JS_ES1\020\255", - "\002\022\022\n\rSOURCE_JS_ES2\020\256\002\022\022\n\rSOURCE_JS_ES3\020\257" + - "\002\022\022\n\rSOURCE_JS_ES5\020\260\002\022\022\n\rSOURCE_JS_ES6\020\261" + - "\002\022\022\n\rSOURCE_JS_ES7\020\262\002\022\022\n\rSOURCE_JS_ES8\020\263" + - "\002\022\r\n\010JS_ERROR\020\254\002\022\025\n\020SOURCE_PHP_ERROR\020\220\003\022" + - "\020\n\013SOURCE_PHP5\020\221\003\022\022\n\rSOURCE_PHP5_3\020\222\003\022\022\n" + - "\rSOURCE_PHP5_4\020\223\003\022\022\n\rSOURCE_PHP5_5\020\224\003\022\022\n" + - "\rSOURCE_PHP5_6\020\225\003\022\022\n\rSOURCE_PHP7_0\020\226\003\022\022\n" + - "\rSOURCE_PHP7_1\020\227\003\022\016\n\tPHP_ERROR\020\220\003\022\026\n\021SOU" + - "RCE_HTML_ERROR\020\364\003\022\020\n\013Source_HTML\020\365\003\022\017\n\nH" + - "TML_ERROR\020\364\003\022\025\n\020SOURCE_XML_ERROR\020\330\004\022\017\n\nS", - "ource_XML\020\331\004\022\016\n\tXML_ERROR\020\330\004\022\025\n\020SOURCE_C" + - "SS_ERROR\020\274\005\022\017\n\nSource_CSS\020\275\005\022\016\n\tCSS_ERRO" + - "R\020\274\005\032\002\020\001B\002H\001" + " \003(\005\022\030\n\020previous_indices\030\n \003(\005\022\017\n\007ast_ke" + + "y\030\013 \001(\005\"\247\007\n\010FileKind\022\t\n\005OTHER\020\000\022\n\n\006BINAR", + "Y\020\001\022\010\n\004TEXT\020\002\022\007\n\003XML\020\003\022\025\n\021SOURCE_JAVA_ER" + + "ROR\020d\022\024\n\020SOURCE_JAVA_JLS2\020f\022\024\n\020SOURCE_JA" + + 
"VA_JLS3\020g\022\024\n\020SOURCE_JAVA_JLS4\020h\022\024\n\020SOURC" + + "E_JAVA_JLS8\020l\022\016\n\nJAVA_ERROR\020d\022\010\n\004JLS2\020f\022" + + "\010\n\004JLS3\020g\022\010\n\004JLS4\020h\022\010\n\004JLS8\020l\022\024\n\017SOURCE_" + + "CS_ERROR\020\310\001\022\022\n\rSOURCE_CS_CS1\020\311\001\022\022\n\rSOURC" + + "E_CS_CS2\020\312\001\022\022\n\rSOURCE_CS_CS3\020\313\001\022\022\n\rSOURC" + + "E_CS_CS4\020\314\001\022\022\n\rSOURCE_CS_CS5\020\315\001\022\r\n\010CS_ER" + + "ROR\020\310\001\022\010\n\003CS1\020\311\001\022\010\n\003CS2\020\312\001\022\010\n\003CS3\020\313\001\022\010\n\003" + + "CS4\020\314\001\022\010\n\003CS5\020\315\001\022\024\n\017SOURCE_JS_ERROR\020\254\002\022\022", + "\n\rSOURCE_JS_ES1\020\255\002\022\022\n\rSOURCE_JS_ES2\020\256\002\022\022" + + "\n\rSOURCE_JS_ES3\020\257\002\022\022\n\rSOURCE_JS_ES5\020\260\002\022\022" + + "\n\rSOURCE_JS_ES6\020\261\002\022\022\n\rSOURCE_JS_ES7\020\262\002\022\022" + + "\n\rSOURCE_JS_ES8\020\263\002\022\r\n\010JS_ERROR\020\254\002\022\025\n\020SOU" + + "RCE_PHP_ERROR\020\220\003\022\020\n\013SOURCE_PHP5\020\221\003\022\022\n\rSO" + + "URCE_PHP5_3\020\222\003\022\022\n\rSOURCE_PHP5_4\020\223\003\022\022\n\rSO" + + "URCE_PHP5_5\020\224\003\022\022\n\rSOURCE_PHP5_6\020\225\003\022\022\n\rSO" + + "URCE_PHP7_0\020\226\003\022\022\n\rSOURCE_PHP7_1\020\227\003\022\016\n\tPH" + + "P_ERROR\020\220\003\022\026\n\021SOURCE_HTML_ERROR\020\364\003\022\020\n\013So" + + "urce_HTML\020\365\003\022\017\n\nHTML_ERROR\020\364\003\022\025\n\020SOURCE_", + "XML_ERROR\020\330\004\022\017\n\nSource_XML\020\331\004\022\016\n\tXML_ERR" + + "OR\020\330\004\022\025\n\020SOURCE_CSS_ERROR\020\274\005\022\017\n\nSource_C" + + "SS\020\275\005\022\016\n\tCSS_ERROR\020\274\005\032\002\020\001B\002H\001" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -3219,7 +3300,7 @@ public com.google.protobuf.ExtensionRegistry assignDescriptors( internal_static_boa_types_ChangedFile_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_boa_types_ChangedFile_descriptor, - new java.lang.String[] { "Change", "Kind", "Name", "Key", "Ast", "Comments", "Changes", "PreviousNames", "PreviousVersions", "PreviousIndices", }); + new java.lang.String[] { "Change", "Kind", "Name", "Key", "Ast", "Comments", "Changes", "PreviousNames", "PreviousVersions", "PreviousIndices", "AstKey", }); return null; } }; diff --git a/src/java/boa/datagen/forges/github/GetReposByLanguage.java b/src/java/boa/datagen/forges/github/GetReposByLanguage.java index 4c5dd5ba8..8eeef9135 100644 --- a/src/java/boa/datagen/forges/github/GetReposByLanguage.java +++ b/src/java/boa/datagen/forges/github/GetReposByLanguage.java @@ -1,26 +1,49 @@ package boa.datagen.forges.github; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.util.ArrayList; import java.util.Calendar; import java.util.Date; +import java.util.HashSet; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; -import boa.datagen.util.FileIO; +import boa.datagen.util.FileIO; public class GetReposByLanguage { - + + // GET PROJECT WITH STARS LARGER OR EQUAL TO THIS NUMBER + static int stars = 0; + static long start, stop = 0; + public static void main(String[] args) { + + if 
(args.length < 4) { + System.out.println("args: TOKEN_FILE_INPUT_PATH, OUTPUT_PATH, STARS, LANGS"); + return; + } + TokenList tokens = new TokenList(args[0]); String outDir = args[1]; - String[] languages = { "java" }; - if (args.length > 2) { - languages = new String[args.length - 2]; - for (int i = 2; i < args.length; i++) - languages[i - 2] = args[i]; + stars = Integer.parseInt(args[2]); + // LANGS may be one arg ("java;php") or several ("java, php"); join everything after STARS and split on either separator + String[] languages; + { + String langArgs = ""; + for (int i = 3; i < args.length; i++) { + langArgs += args[i]; + } + languages = langArgs.split("[;,]"); + for (int i = 0; i < languages.length; i++) + languages[i] = languages[i].trim(); } Thread[] workers = new Thread[languages.length]; for (int i = 0; i < languages.length; i++) { @@ -35,91 +58,115 @@ public static void main(String[] args) { e.printStackTrace(); } } - - + public static class Worker implements Runnable { - private final int id; + private final int id; private final String language; private TokenList tokens; private final String outDir; private JsonArray repos = new JsonArray(); private final int RECORDS_PER_FILE = 100; private int counter = 0; - + private HashSet<Integer> processedRepID = new HashSet<>(); + private ArrayList<Integer> IDtoWrite = new ArrayList<>(); + public Worker(int id, String language, String outDir, TokenList tokenList) { this.id = id; this.language = language; this.outDir = outDir; this.tokens = tokenList; + File processedRepos = new File(outDir + "/" + language + "processed.txt"); + try { + processedRepos.createNewFile(); + Files.lines(processedRepos.toPath()).forEach(repID -> processedRepID.add(Integer.parseInt(repID))); + } catch (IOException e) { + e.printStackTrace(); + } } - + @Override public void run() { + start = System.currentTimeMillis(); Calendar cal = Calendar.getInstance(); cal.setTime(new Date()); cal.add(Calendar.DATE, 1); int year = cal.get(Calendar.YEAR); int month = cal.get(Calendar.MONTH) + 1; // month starts from 0 int day = cal.get(Calendar.DAY_OF_MONTH); + + String monthString = month < 10 ? "0" + month : String.valueOf(month); + String dayString = day < 10 ? 
"0" + day : String.valueOf(day); + String time = year + "-" + monthString + "-" + dayString + "T23:59:59Z"; + // String time = "2018-12-21T01:01:01Z"; - String time = year + "-" + month + "-" + day + "T23:59:59Z"; +// String time = year + "-" + month + "-" + day + "T23:59:59Z"; Gson parser = new Gson(); - while (true){ - Token tok = this.tokens.getNextAuthenticToken("https://api.github.com/repositories"); - String url = "https://api.github.com/search/repositories?q=language:" + language +"+stars:>1+pushed:<=" + time + "&sort=updated&order=desc&per_page=100"; - System.out.println(url); - MetadataCacher mc = new MetadataCacher(url, tok.getUserName(), tok.getToken()); - mc.authenticate(); - while (!mc.isAuthenticated() || mc.getNumberOfRemainingLimit() <= 0) { - try { - Thread.sleep(1000); - } catch (InterruptedException e1) { - e1.printStackTrace(); - } - mc = new MetadataCacher(url, tok.getUserName(), tok.getToken()); - mc.authenticate(); - } - mc.getResponseJson(); - String content = mc.getContent(); + Token tokenGetAPI = this.tokens.getNextAuthenticTokenM("https://api.github.com"); + Token tokenSearch = this.tokens.getNextAuthenticTokenM("https://api.github.com/search/repositories?q=language"); + MetadataCacher mcGetAPI = new MetadataCacher("https://api.github.com/repositories", tokenGetAPI.getUserName(), tokenGetAPI.getToken()); + mcGetAPI.authenticate(); + while (true) { + String searchURL = "https://api.github.com/search/repositories?q=language:" + language + "+stars:>=" + stars + + "+pushed:<=" + time + "&sort=updated&order=desc&per_page=100"; + System.out.println(searchURL); + MetadataCacher mcSearch = new MetadataCacher(searchURL, tokenSearch.getUserName(), tokenSearch.getToken()); + mcSearch.authenticate(); + // We don't need to check for remaining limit for search because 30 requests/1 min is way faster than we could process, could be + // useful if in the future we can find a way to get 100 repos in 2 secs. 
+// while (mcSearch.getNumberOfRemainingLimit() <= 0) { +// System.out.println("user: " + tokenSearch.getUserName() + " limit: " + mcSearch.getNumberOfRemainingLimit()); +// tokenSearch = this.tokens.getNextAuthenticToken(searchURL); +// mcSearch = new MetadataCacher(searchURL, tokenSearch.getUserName(), tokenSearch.getToken()); +// } + mcSearch.getResponseJson(); + String content = mcSearch.getContent(); + JsonObject json = null; json = parser.fromJson(content, JsonElement.class).getAsJsonObject(); - JsonArray items = json.getAsJsonArray("items"); - if (items.size() > 0) { - for (int j = 0; j < items.size(); j++) { - JsonObject item = items.get(j).getAsJsonObject(); - this.addRepo(item); - String pushed = item.get("pushed_at").getAsString(); - if (pushed.compareTo(time) < 0){ - time = pushed; - } - } - } - int count = json.get("total_count").getAsInt(); - if (count == items.size()) - break; - if (tok.getNumberOfRemainingLimit() <= 1) { - long t = mc.getLimitResetTime() * 1000 - System.currentTimeMillis(); - if (t >= 0) { - System.out.println("Waiting " + (t/1000) + " seconds for sending more requests."); - try { - Thread.sleep(t); - } catch (InterruptedException e) { - e.printStackTrace(); + JsonArray items = json.getAsJsonArray("items"); + + if (items.size() > 0) { + int getLimit = mcGetAPI.getNumberOfRemainingLimit(); + System.out.println("Get API rate limit remaining: " + getLimit); + + if (getLimit < items.size() + 1 || !mcGetAPI.isAuthenticated()) { + tokenGetAPI = this.tokens.getNextAuthenticToken("https://api.github.com/repositories", items.size() + 2); // need items.size() requests, plus one to verify the token and one for the languages link + mcGetAPI = new MetadataCacher(mcGetAPI.getUrl(), tokenGetAPI.getUserName(), tokenGetAPI.getToken()); + } + for (int j = 0; j < items.size(); j++) { + JsonObject item = items.get(j).getAsJsonObject(); + // check if repository is already saved + int repID = item.get("id").getAsInt(); + if (!processedRepID.contains(repID)) { + mcGetAPI = addLanguageToRepo(item, parser, mcGetAPI); + + this.addRepo(item); + processedRepID.add(repID); + IDtoWrite.add(repID); + } else { + System.out.println(repID + " already written"); + } + String pushed = item.get("pushed_at").getAsString(); + if (pushed.compareTo(time) < 0) { + time = pushed; } } + System.out.println(" "); } + int count = json.get("total_count").getAsInt(); // total_count can change between queries + if (count == items.size()) + break; } writeRemainingRepos(); - } - + } + private void addRepo(JsonObject repo) { File fileToWriteJson = null; this.repos.add(repo); if (this.repos.size() % RECORDS_PER_FILE == 0) { - fileToWriteJson = new File( - outDir + "/Thread-" + this.id + "-page-" + counter + ".json"); + fileToWriteJson = new File(outDir + "/Thread-" + this.id + "-page-" + counter + ".json"); while (fileToWriteJson.exists()) { - System.out.println(fileToWriteJson.getAbsolutePath() + " arleady exist"); + System.out.println(fileToWriteJson.getAbsolutePath() + " already exists"); counter++; @@ -128,6 +175,20 @@ private void addRepo(JsonObject repo) { FileIO.writeFileContents(fileToWriteJson, this.repos.toString()); System.out.println(Thread.currentThread().getId() + " " + counter++); this.repos = new JsonArray(); + try (FileWriter fw = new FileWriter(outDir + "/" + language + "processed.txt", true); + BufferedWriter bw = new BufferedWriter(fw); + PrintWriter out = new PrintWriter(bw)) { + for (Integer repID : IDtoWrite) { + out.println(repID.intValue()); + } + } catch (IOException e) { + e.printStackTrace(); + } finally { + IDtoWrite.clear(); + } + stop = System.currentTimeMillis(); + System.out.println("Time 
taken: " + (stop - start) / 1000.0 + "seconds"); + start = stop; } } @@ -140,8 +201,25 @@ public void writeRemainingRepos() { fileToWriteJson = new File(outDir + "/Thread-" + this.id + "-page-" + counter + ".json"); } FileIO.writeFileContents(fileToWriteJson, this.repos.toString()); - System.out.println(this.id + counter++); + System.out.println(this.id + counter++); } } + + // Returning metadataCacher so that we don't have to authenticate again to get remaining limit + private MetadataCacher addLanguageToRepo(JsonObject repo, Gson parser, MetadataCacher mc) { + String langurl = "https://api.github.com/repos/" + repo.get("full_name").getAsString() + "/languages"; + mc = new MetadataCacher(langurl, mc.getUsername(), mc.getPassword()); + if (mc.authenticate()) { + mc.getResponse(); + String pageContent = mc.getContent(); + JsonObject languages = parser.fromJson(pageContent, JsonElement.class).getAsJsonObject(); + repo.add("language_list", languages); + } else { + final int responsecode = mc.getResponseCode(); + System.err.println("authentication error " + responsecode); + } + return mc; + } + } -} +} \ No newline at end of file diff --git a/src/java/boa/datagen/forges/github/MetadataCacher.java b/src/java/boa/datagen/forges/github/MetadataCacher.java index 5966292bc..ef4be5ede 100644 --- a/src/java/boa/datagen/forges/github/MetadataCacher.java +++ b/src/java/boa/datagen/forges/github/MetadataCacher.java @@ -44,14 +44,26 @@ public void setUrl(String url) { e.printStackTrace(); } } + + public String getUserName() { + return this.username; + } public void setUsername(String username) { this.username = username; } + + public String getUsername() { + return username; + } public void setPassword(String password) { this.password = password; } + + public String getPassword() { + return password; + } public boolean isAuthenticated() { return authenticated; @@ -79,6 +91,7 @@ public boolean authenticate(String username, String password) { } catch (IOException e) { // considered as failed } +// System.out.println("authenticate: " + this.authenticated); return this.authenticated; } @@ -151,10 +164,14 @@ public int getNumberOfMaxLimit() { } public int getNumberOfRemainingLimit() { - return Integer.parseInt(this.connection.getHeaderField("X-RateLimit-Remaining")); + try { + return Integer.parseInt(this.connection.getHeaderField("X-RateLimit-Remaining")); + } catch(NumberFormatException e) { + return -1; + } } public long getLimitResetTime() { return Long.parseLong(this.connection.getHeaderField("X-RateLimit-Reset")); } -} +} \ No newline at end of file diff --git a/src/java/boa/datagen/forges/github/TokenList.java b/src/java/boa/datagen/forges/github/TokenList.java index dca3a7e1c..6907e7d95 100644 --- a/src/java/boa/datagen/forges/github/TokenList.java +++ b/src/java/boa/datagen/forges/github/TokenList.java @@ -33,17 +33,18 @@ public TokenList(String path) { } } + public Token getNextAuthenticToken(String url) { MetadataCacher mc = null; while (true) { for (Token token : tokens) { - // System.out.println("Trying token: " + token.getId()); mc = new MetadataCacher(url, token.getUserName(), token.getToken()); - if (mc.authenticate()) { + if (mc.authenticate() && mc.getNumberOfRemainingLimit() >= 1) { if (this.lastUsedToken != token.getId()) { this.lastUsedToken = token.getId(); - System.out.println("now using token: " + token.getId()); +// System.out.println("now using token: " + token.getId()); } +// System.out.println("Use authentic token: " + token.getId() + " user: " + token.getUserName()); return token; 
} } @@ -57,6 +58,105 @@ public Token getNextAuthenticToken(String url) { // throw new IllegalArgumentException(); } + + public MetadataCacher getNextAuthenticMetadataCacher(String url) { + MetadataCacher mc = null; + while (true) { + for (Token token : tokens) { + System.out.println("Trying token " + token.getId()); + mc = new MetadataCacher(url, token.getUserName(), token.getToken()); + if (mc.authenticate()) { + if (this.lastUsedToken != token.getId()) { + this.lastUsedToken = token.getId(); +// System.out.println("now using token: " + token.getId()); + } + System.out.println("Use authentic token: " + token.getId() + " user: " + token.getUserName()); + return mc; + } + // auth failed but quota remains: the URL itself is bad (e.g. a 404), so give up + if (mc.getNumberOfRemainingLimit() >= 1) + return null; + } + try { + System.out.println("waiting for token, going to sleep for 10s"); + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + + // throw new IllegalArgumentException(); + } + + public Token getNextAuthenticTokenM(String url) { + MetadataCacher mc = null; + while (true) { + for (Token token : tokens) { + mc = new MetadataCacher(url, token.getUserName(), token.getToken()); + System.out.println("Trying token: " + token.getUserName() + " for " + url.substring(0, Math.min(url.length(), 40)) + ((url.length() < 40) ? "" : "...")); + mc.authenticate(); + System.out.println(mc.getUsername() + " " + mc.isAuthenticated()); + if (mc.getNumberOfRemainingLimit() >= 0) { + System.out.println(mc.getNumberOfRemainingLimit()); + if (this.lastUsedToken != token.getId()) { + this.lastUsedToken = token.getId(); + System.out.println("now using token: " + token.getUserName()); + } + return token; + } + } + try { +// long t = mc.getLimitResetTime() * 1000 - System.currentTimeMillis(); +// if (t >= 0) { // could be useful if json is created too fast +// System.out.println("Waiting " + (t / 1000) + " seconds for sending more requests."); +// try { +// Thread.sleep(t); +// } catch (InterruptedException e) { +// e.printStackTrace(); +// } +// } System.out.println("waiting for token, going to sleep for 10s"); + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } } + // throw new IllegalArgumentException(); + } + + + public Token getNextAuthenticToken(String url, int minRateLimit) { + MetadataCacher mc = null; + while (true) { + for (Token token : tokens) { + System.out.println("Trying token: " + token.getUserName() + " "); + mc = new MetadataCacher(url, token.getUserName(), token.getToken()); + mc.authenticate(); + if (mc.isAuthenticated()) { + int limitRemaining = mc.getNumberOfRemainingLimit(); + if (limitRemaining < minRateLimit) { + System.out.println("Authenticated but remaining limit " + limitRemaining + " is below the required minimum of " + minRateLimit); + continue; + } + if (this.lastUsedToken != token.getId()) { + this.lastUsedToken = token.getId(); + System.out.println("Now using token: " + token.getUserName() + " "); + } + System.out.println(mc.getNumberOfRemainingLimit()); + return token; + } else { + System.err.println("Can't authenticate, response code: " + mc.getResponseCode()); + } + } + try { + System.out.println("waiting for token, going to sleep for 10s"); + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } } + // throw new IllegalArgumentException(); + } public synchronized Token getAuthenticatedToken(long threadId) { while (true) { @@ -83,4 +183,4 @@ public synchronized void removeToken(Token tok) { public synchronized void addToken(Token tok) { 
this.tokens.add(tok); } -} +} \ No newline at end of file diff --git a/src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java b/src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java similarity index 65% rename from src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java rename to src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java index b0e435b4b..2c59b6452 100644 --- a/src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java +++ b/src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java @@ -1,4 +1,4 @@ -package boa.datagen.forges.github; +package boa.datagen.slurm; import java.io.File; import org.eclipse.jgit.api.Git; @@ -8,6 +8,7 @@ import boa.datagen.util.FileIO; +// Datagen Phase 1: download all bare repositories public class GitHubRepoBareDownloader { private static String INPUT_PATH; // The directory contains a list of repo json files @@ -19,62 +20,63 @@ public static void main(String[] args) { if (args.length < 3) { System.out.println("args: INPUT_NAMES_PATH, OUTPUT_REPOS_PATH, THREAD_NUM"); - } else { - INPUT_PATH = args[0]; - OUTPUT_REPOS_PATH = args[1]; - THREAD_NUM = Integer.parseInt(args[2]); - - File input = new File(INPUT_PATH); - - DownloadWorker[] workers = new DownloadWorker[THREAD_NUM]; - Thread[] threads = new Thread[THREAD_NUM]; - for (int i = 0; i < THREAD_NUM; i++) { - workers[i] = new DownloadWorker(i); - threads[i] = new Thread(workers[i]); - threads[i].start(); - } + return; + } - // assign tasks to workers - for (File file : input.listFiles()) { - if (!file.getName().endsWith(".json")) - continue; - JsonElement jsonTree = new JsonParser().parse(FileIO.readFileContents(file)); - for (JsonElement je : jsonTree.getAsJsonArray()) { - String projectName = je.getAsJsonObject().get("html_url").getAsString() - .replace("https://github.com/", ""); - - boolean assigned = false; - while (!assigned) { - for (int j = 0; j < THREAD_NUM; j++) { - if (workers[j].isReady()) { - workers[j].setName(projectName); - workers[j].setReady(false); - assigned = true; - break; - } - } - try { - Thread.sleep(100); - } catch (InterruptedException e) { - e.printStackTrace(); + INPUT_PATH = args[0]; + OUTPUT_REPOS_PATH = args[1]; + THREAD_NUM = Integer.parseInt(args[2]); + + File input = new File(INPUT_PATH); + + DownloadWorker[] workers = new DownloadWorker[THREAD_NUM]; + Thread[] threads = new Thread[THREAD_NUM]; + for (int i = 0; i < THREAD_NUM; i++) { + workers[i] = new DownloadWorker(i); + threads[i] = new Thread(workers[i]); + threads[i].start(); + } + + // assign tasks to workers + for (File file : input.listFiles()) { + if (!file.getName().endsWith(".json")) + continue; + JsonElement jsonTree = new JsonParser().parse(FileIO.readFileContents(file)); + for (JsonElement je : jsonTree.getAsJsonArray()) { + String projectName = je.getAsJsonObject().get("html_url").getAsString().replace("https://github.com/", + ""); + + boolean assigned = false; + while (!assigned) { + for (int j = 0; j < THREAD_NUM; j++) { + if (workers[j].isReady()) { + workers[j].setName(projectName); + workers[j].setReady(false); + assigned = true; + break; } } - } - } - - // wait for all done - for (int j = 0; j < THREAD_NUM; j++) { - while (!workers[j].isReady()) try { Thread.sleep(100); } catch (InterruptedException e) { e.printStackTrace(); } + } } + } - setDone(true); + // wait for all done + for (int j = 0; j < THREAD_NUM; j++) { + while (!workers[j].isReady()) + try { + Thread.sleep(100); + } catch (InterruptedException e) { + e.printStackTrace(); + } } + setDone(true); + } 
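+	// Note on the hand-off above: the main thread busy-waits until some worker reports isReady(), assigns it one "user/repo" name via setName(), and marks it busy with setReady(false); each DownloadWorker then makes a bare clone of its assigned repository and flips itself back to ready when done.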
synchronized static boolean getDone() { @@ -138,7 +140,7 @@ private void runJob() { result.getRepository().close(); } } else { - System.out.println("repo " + projectName + "already exists"); + System.out.println("repo " + projectName + " already exists"); } } diff --git a/src/java/boa/datagen/slurm/SeqRepoBuilder.java b/src/java/boa/datagen/slurm/SeqRepoBuilder.java new file mode 100644 index 000000000..9c182ca73 --- /dev/null +++ b/src/java/boa/datagen/slurm/SeqRepoBuilder.java @@ -0,0 +1,304 @@ +package boa.datagen.slurm; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; + +import boa.datagen.DefaultProperties; +import boa.datagen.forges.github.RepoMetadata; +import boa.datagen.scm.AbstractConnector; +import boa.datagen.scm.GitConnector; +import boa.datagen.util.FileIO; +import boa.types.Code.CodeRepository; +import boa.types.Code.Revision; +import boa.types.Toplevel.Project; + +// Datagen Phase 2: the DATAGEN_JAR executed by each slurm job +public class SeqRepoBuilder { + + private static String REPO_PATH; + private static String JSON_FILES_PATH; + private static String OUTPUT_PATH; + + private static Configuration conf = null; + private static FileSystem fileSystem = null; + + private static String suffix; + private static SequenceFile.Writer projectWriter, astWriter, commitWriter, contentWriter; + private static long astWriterLen = 1, commitWriterLen = 1, contentWriterLen = 1; + + private static int MAX_COMMITS = Integer.valueOf(DefaultProperties.MAX_SIZE_FOR_PROJECT_WITH_COMMITS); + + public static void main(String[] args) throws IOException { + + if (args.length < 3) { + System.err.println("Need args:\n" + "REPO_PATH\n" + "JSON_FILES_PATH\n" + "OUTPUT_PATH\n"); + return; + } + + REPO_PATH = args[0]; + JSON_FILES_PATH = args[1]; + OUTPUT_PATH = args[2]; + + conf = new Configuration(); + fileSystem = FileSystem.get(conf); + boa.datagen.DefaultProperties.DEBUG = true; + + int counter = 0; + for (String jsonFilePath : getJsonFilePaths()) { + File file = new File(jsonFilePath); + String content = FileIO.readFileContents(file); + Gson parser = new Gson(); + JsonArray repoArray = null; + try { + repoArray = parser.fromJson(content, JsonElement.class).getAsJsonArray(); + } catch (Exception e) { + System.err.println("Error processing page: " + file.getPath()); + e.printStackTrace(); + continue; + } + // iterate each json object (project metadata) in the json array + for (int i = 0; i < repoArray.size(); i++) { + JsonObject rp = repoArray.get(i).getAsJsonObject(); + RepoMetadata repo = new RepoMetadata(rp); + if (repo.id != null && repo.name != null) { + System.out.println("Processing project " + (++counter) + ": " + repo.name); + // generate seq files for this project + Project project = repo.toBoaMetaDataProtobuf(); + process(project); + } + } + } + + // done + } + + private static void process(Project project) { + String projectName = project.getName(); + String[] writerPaths = 
openWriters(projectName); + + // if writerPaths is null, the project has already been processed. + if (writerPaths == null) { + System.out.println(projectName + " seq file already exists"); + return; + } + + try { + project = storeRepository(project, 0); + // skip the project if storeRepository returned null + if (project == null) { + System.out.println(projectName + " is null, skipping it"); + clear(writerPaths); + return; + } + + // store project into sequence file + BytesWritable bw = new BytesWritable(project.toByteArray()); + if (bw.getLength() <= MAX_COMMITS || (project.getCodeRepositoriesCount() > 0 + && project.getCodeRepositories(0).getRevisionKeysCount() > 0)) { + // Approach 1: if the Project size is acceptable, then directly append the + // Project instance into the sequence file + projectWriter.append(new Text(project.getId()), bw); + } else { + // Approach 2: if the size is too large, extract Commit instances and append + // them into commit sequence file. + Project.Builder pb = Project.newBuilder(project); + for (CodeRepository.Builder cb : pb.getCodeRepositoriesBuilderList()) { + for (Revision.Builder rb : cb.getRevisionsBuilderList()) { + cb.addRevisionKeys(commitWriterLen); + bw = new BytesWritable(rb.build().toByteArray()); + commitWriter.append(new LongWritable(commitWriterLen), bw); + commitWriterLen += bw.getLength(); + } + cb.clearRevisions(); + } + projectWriter.append(new Text(pb.getId()), new BytesWritable(pb.build().toByteArray())); + } + } catch (Throwable e) { + e.printStackTrace(); + clear(writerPaths); + return; + } + + System.out.println(projectName + " finished"); + closeWriters(); + } + + private static void clear(String[] writerPaths) { + closeWriters(); + // remove sequence files + for (String path : writerPaths) { + File file = new File(path); + if (file.exists()) + org.apache.commons.io.FileUtils.deleteQuietly(file); + } + } + + private static Project storeRepository(final Project project, final int i) { + final CodeRepository repo = project.getCodeRepositories(i); // this is an empty code repo + final Project.Builder projBuilder = Project.newBuilder(project); + + final String name = project.getName(); + File gitDir = new File(REPO_PATH + "/" + name); + + // return null to skip empty project + if (isFiltered(project)) { + System.err.println(name + " is filtered"); + return null; + } + + AbstractConnector conn = null; + try { + conn = new GitConnector(gitDir.getAbsolutePath(), project.getName(), astWriter, astWriterLen, commitWriter, + commitWriterLen, contentWriter, contentWriterLen); + final CodeRepository.Builder repoBuilder = CodeRepository.newBuilder(repo); + + List<Object> revisions = conn.getRevisions(project.getName()); + if (!revisions.isEmpty()) { + if (revisions.get(0) instanceof Revision) { + // Approach 1: if the revision object is Revision, add it into the repoBuilder + for (final Object rev : revisions) { + final Revision.Builder revBuilder = Revision.newBuilder((Revision) rev); + repoBuilder.addRevisions(revBuilder); + } + } else { + // Approach 2: else save it as a key pointing to the Revision instance in the + // commit sequence file + for (final Object rev : revisions) + repoBuilder.addRevisionKeys((Long) rev); + } + } + + // head commit indicates the latest commit which may not be in the default + // branch + repoBuilder.setHead(conn.getHeadCommitOffset()); + repoBuilder.addAllHeadSnapshot(conn.buildHeadSnapshot()); + repoBuilder.addAllBranches(conn.getBranchIndices()); + repoBuilder.addAllBranchNames(conn.getBranchNames()); + 
repoBuilder.addAllTags(conn.getTagIndices()); + repoBuilder.addAllTagNames(conn.getTagNames()); + projBuilder.setCodeRepositories(i, repoBuilder); + + // return the completely built project + return projBuilder.build(); + + } catch (final Throwable e) { + System.err.println("unknown error " + project.getName()); + e.printStackTrace(); + } finally { + if (conn != null) { + astWriterLen = conn.getAstWriterLen(); + commitWriterLen = conn.getCommitWriterLen(); + contentWriterLen = conn.getContentWriterLen(); + try { + conn.close(); + } catch (Exception e) { + System.err.println("Cannot close Git connector to " + gitDir.getAbsolutePath()); + e.printStackTrace(); + } + } + } + + // return null to skip error project + return null; + } + + private synchronized static boolean isFiltered(Project project) { + if (project.getForked()) + return true; +// if (project.getStars() < 2 && project.getSize() < 100) +// return true; + if (project.getProgrammingLanguagesList().contains("Java") + || project.getProgrammingLanguagesList().contains("JavaScript") + || project.getProgrammingLanguagesList().contains("PHP")) + return false; + String lang = project.getMainLanguage(); + if (lang != null && (lang.equals("Java") || lang.equals("JavaScript") || lang.equals("PHP"))) + return false; + return true; + } + + public static String[] openWriters(String projectName) { + suffix = projectName + ".seq"; + while (true) { + try { + String projectWriterPath = OUTPUT_PATH + "/project/" + suffix; + + // if the project is already processed return null + if (new File(projectWriterPath).exists()) + return null; + + projectWriter = SequenceFile.createWriter(fileSystem, conf, new Path(projectWriterPath), Text.class, + BytesWritable.class, CompressionType.BLOCK); + + String astWriterPath = OUTPUT_PATH + "/ast/" + suffix; + astWriter = SequenceFile.createWriter(fileSystem, conf, new Path(astWriterPath), LongWritable.class, + BytesWritable.class, CompressionType.BLOCK); + + String commitWriterPath = OUTPUT_PATH + "/commit/" + suffix; + commitWriter = SequenceFile.createWriter(fileSystem, conf, new Path(commitWriterPath), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + + String contentWriterPath = OUTPUT_PATH + "/source/" + suffix; + contentWriter = SequenceFile.createWriter(fileSystem, conf, new Path(contentWriterPath), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + + astWriterLen = 1; + commitWriterLen = 1; + contentWriterLen = 1; + + return new String[] { projectWriterPath, astWriterPath, commitWriterPath, contentWriterPath }; + } catch (Throwable t) { + t.printStackTrace(); + } } } + + public static void closeWriters() { + while (true) { + try { + projectWriter.close(); + astWriter.close(); + commitWriter.close(); + contentWriter.close(); + return; + } catch (Throwable t) { + t.printStackTrace(); + } } } + + private static List<String> getJsonFilePaths() { + List<String> jsonFilePaths = new ArrayList<String>(); + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(JSON_FILES_PATH)); + String line = reader.readLine(); + while (line != null) { + jsonFilePaths.add(line); + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + return jsonFilePaths; + } } diff --git a/src/java/boa/datagen/slurm/SeqRepoCombiner.java b/src/java/boa/datagen/slurm/SeqRepoCombiner.java new file mode 100644 index 000000000..67a9ed9ae --- /dev/null +++ b/src/java/boa/datagen/slurm/SeqRepoCombiner.java @@ -0,0 +1,236 @@ +package boa.datagen.slurm; + 
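+// Key-rebasing sketch (inferred from the code below): each per-project seq file from Phase 2 numbers its AST and commit entries independently starting at 1, so while merging, every ChangedFile key and revision key is shifted by the running totals (lastAstWriterKey/lastCommitWriterKey), and ChangedFile.ast_key records which ast/map<N> shard received the project's ASTs.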
+import java.io.EOFException; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.DefaultCodec; + +import com.google.protobuf.CodedInputStream; + +import boa.types.Code.CodeRepository; +import boa.types.Code.Revision; +import boa.types.Diff.ChangedFile; +import boa.types.Toplevel.Project; + +// Datagen Phase 3: combine sequence files +public class SeqRepoCombiner { + + private static String DATASET_PATH; // generated dataset in phase 2 + private static int PROJECT_NUM_IN_AST; // maximum number of projects in each ast map + + private static FileSystem fs; + private static Configuration conf; + private static SequenceFile.Writer projectWriter; + private static MapFile.Writer astWriter; + private static MapFile.Writer commitWriter; + + public static void main(String[] args) { + + if (args.length < 2) { + System.err.println("Need args:\n" + "DATASET_PATH\n" + "PROJECT_NUM_IN_AST\n"); + return; + } + + DATASET_PATH = args[0]; + PROJECT_NUM_IN_AST = Integer.parseInt(args[1]); + + try { + conf = new Configuration(); + fs = FileSystem.get(conf); + + // remove any previously combined seq files + checkAndRemove(DATASET_PATH + "/combined"); + + int astMapSuffix = 0, fileCount = 0, projectCount = 0; + openWriters(astMapSuffix); + + long lastAstWriterKey = 0, lastCommitWriterKey = 0; + HashSet<String> processedProjectNames = new HashSet<String>(); + // iterate each directory + for (FileStatus file : fs.listStatus(new Path(DATASET_PATH + "/project"))) { + if (!file.isDir()) + continue; + // iterate each seq file + for (FileStatus seqFile : fs.listStatus(file.getPath())) { + if (!seqFile.getPath().getName().endsWith(".seq")) + continue; + fileCount++; + String name = seqFile.getPath().getName(); + + SequenceFile.Reader r = null; + try { + System.out.println("Reading file " + fileCount + " : " + name); + r = new SequenceFile.Reader(fs, seqFile.getPath(), conf); + Text textKey = new Text(); + BytesWritable value = new BytesWritable(); + + String projectName = null; + // each seq file should contain only one project + while (r.next(textKey, value)) { + Project p = Project + .parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength())); + if (processedProjectNames.contains(p.getName())) + continue; + projectName = p.getName(); + Project.Builder pb = Project.newBuilder(p); + for (CodeRepository.Builder crb : pb.getCodeRepositoriesBuilderList()) { + if (crb.getRevisionsCount() > 0) { + for (Revision.Builder rb : crb.getRevisionsBuilderList()) { + for (ChangedFile.Builder cfb : rb.getFilesBuilderList()) { + long key = cfb.getKey(); + if (key > 0) + cfb.setKey(lastAstWriterKey + key); + cfb.setAstKey(astMapSuffix); + } + } + } else { + for (int j = 0; j < crb.getRevisionKeysCount(); j++) { + crb.setRevisionKeys(j, lastCommitWriterKey + crb.getRevisionKeys(j)); + } + } + for (ChangedFile.Builder cfb : crb.getHeadSnapshotBuilderList()) { + long key = cfb.getKey(); + if (key > 0) + cfb.setKey(lastAstWriterKey + key); + cfb.setAstKey(astMapSuffix); + } + } + projectWriter.append(textKey, new 
BytesWritable(pb.build().toByteArray())); + processedProjectNames.add(projectName); + + // rebase keys and append the project's corresponding commit and ast seq files + lastCommitWriterKey = readAndAppendCommit(conf, fs, commitWriter, + DATASET_PATH + "/commit/" + projectName + ".seq", lastAstWriterKey, + lastCommitWriterKey, astMapSuffix); + lastAstWriterKey = readAndAppendAst(conf, fs, astWriter, + DATASET_PATH + "/ast/" + projectName + ".seq", lastAstWriterKey); + + System.out.println("Finished project " + projectName); + + projectCount++; + // open a new ast writer if current writer hits the maximum project number + if (projectCount >= PROJECT_NUM_IN_AST) { + astMapSuffix++; + astWriter.close(); + astWriter = new MapFile.Writer(conf, fs, DATASET_PATH + "/combined/ast/map" + astMapSuffix, + LongWritable.class, BytesWritable.class, CompressionType.BLOCK, + new DefaultCodec(), null); + projectCount = 0; + } + } + } catch (EOFException e) { + e.printStackTrace(); + System.err.println("ignoring project " + name); + continue; + } catch (Exception e) { + e.printStackTrace(); + System.err.println("ignoring project " + name); + continue; + } finally { + if (r != null) + r.close(); + } + } + } + + closeWriters(); + fs.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void openWriters(int astCount) { + CompressionType compType = CompressionType.BLOCK; + CompressionCodec compCode = new DefaultCodec(); + try { + projectWriter = SequenceFile.createWriter(fs, conf, new Path(DATASET_PATH + "/combined/projects.seq"), + Text.class, BytesWritable.class, compType, compCode); + astWriter = new MapFile.Writer(conf, fs, DATASET_PATH + "/combined/ast/map" + astCount, LongWritable.class, + BytesWritable.class, compType, compCode, null); + commitWriter = new MapFile.Writer(conf, fs, DATASET_PATH + "/combined/commit", LongWritable.class, + BytesWritable.class, compType, compCode, null); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public static void closeWriters() { + try { + projectWriter.close(); + astWriter.close(); + commitWriter.close(); + } catch (Throwable t) { + t.printStackTrace(); + } + } + + private static void checkAndRemove(String path) { + File file = new File(path); + if (file.exists()) { + System.out.println("remove file " + path); + org.apache.commons.io.FileUtils.deleteQuietly(file); + } + } + + public static long readAndAppendCommit(Configuration conf, FileSystem fileSystem, MapFile.Writer writer, + String fileName, long lastAstKey, long lastCommitKey, int astMapSuffix) throws IOException { + long newLastKey = lastCommitKey; + SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, new Path(fileName), conf); + LongWritable longKey = new LongWritable(); + BytesWritable value = new BytesWritable(); + try { + while (r.next(longKey, value)) { + newLastKey = longKey.get() + lastCommitKey; + Revision rev = Revision.parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength())); + Revision.Builder rb = Revision.newBuilder(rev); + for (ChangedFile.Builder cfb : rb.getFilesBuilderList()) { + long key = cfb.getKey(); + if (key > 0) + cfb.setKey(lastAstKey + key); + cfb.setAstKey(astMapSuffix); + } + writer.append(new LongWritable(newLastKey), new BytesWritable(rb.build().toByteArray())); + } + } catch (Exception e) { + System.err.println(fileName); + e.printStackTrace(); + } finally { + r.close(); + } + return newLastKey; + } + + public static long readAndAppendAst(Configuration conf, FileSystem fileSystem, MapFile.Writer writer, + String fileName, long lastKey) throws 
IOException { + long newLastKey = lastKey; + SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, new Path(fileName), conf); + LongWritable longKey = new LongWritable(); + BytesWritable value = new BytesWritable(); + try { + while (r.next(longKey, value)) { + newLastKey = longKey.get() + lastKey; + writer.append(new LongWritable(newLastKey), value); + } + } catch (Exception e) { + System.err.println(fileName); + e.printStackTrace(); + } finally { + r.close(); + } + return newLastKey; + } +} diff --git a/src/java/boa/datagen/slurm/SeqRepoGenerator.java b/src/java/boa/datagen/slurm/SeqRepoGenerator.java new file mode 100644 index 000000000..34486defd --- /dev/null +++ b/src/java/boa/datagen/slurm/SeqRepoGenerator.java @@ -0,0 +1,112 @@ +package boa.datagen.slurm; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +// Datagen Phase 2: submit slurm jobs that generate seq files for the bare repositories +public class SeqRepoGenerator { + + private static String SLURM_JOB_TEMPLATE_PATH; + private static String JSON_INPUT_PATH; // The directory containing the repo json files + private static String REPO_INPUT_PATH; // The directory containing the bare repositories + private static String SPLIT_JSON_PATH; + private static String DATAGEN_JAR_PATH; + private static String OUTPUT_PATH; + private static int FILE_NUM_PER_JOB; + + public static void main(String[] args) { + + if (args.length < 7) { + System.err.println("Need args:\n" + "SLURM_JOB_TEMPLATE_PATH\n" + "JSON_INPUT_PATH\n" + "REPO_INPUT_PATH\n" + + "SPLIT_JSON_PATH\n" + "DATAGEN_JAR_PATH\n" + "OUTPUT_PATH\n" + "FILE_NUM_PER_JOB"); + return; + } + + SLURM_JOB_TEMPLATE_PATH = args[0]; + JSON_INPUT_PATH = args[1]; + REPO_INPUT_PATH = args[2]; + SPLIT_JSON_PATH = args[3]; + DATAGEN_JAR_PATH = args[4]; + OUTPUT_PATH = args[5]; + FILE_NUM_PER_JOB = Integer.parseInt(args[6]); + + // check split directory + File splitDir = new File(SPLIT_JSON_PATH); + if (splitDir.exists()) { + System.out.println("deleting " + splitDir.getAbsolutePath()); + org.apache.commons.io.FileUtils.deleteQuietly(splitDir); + } + if (!splitDir.mkdir()) + System.err.println("can't make directory " + splitDir.getAbsolutePath()); + + // split json files + File input = new File(JSON_INPUT_PATH); + List<String> files = new ArrayList<String>(); + int count = 0; + for (File file : input.listFiles()) { + if (file.getName().endsWith(".json")) { + files.add(file.getAbsolutePath()); + if (files.size() == FILE_NUM_PER_JOB) { + write(files, count++); + files = new ArrayList<String>(); + } + } + } + if (files.size() != 0) + write(files, count); + + // run slurm job + input = new File(SPLIT_JSON_PATH); + for (File file : input.listFiles()) + runSlurmJob(file.getAbsolutePath()); + } + + private static void write(List<String> files, int count) { + StringBuilder sb = new StringBuilder(); + for (String s : files) + sb.append(s + "\n"); + BufferedWriter writer; + String path = SPLIT_JSON_PATH + "/" + count + ".txt"; + try { + writer = new BufferedWriter(new FileWriter(path)); + writer.write(sb.toString()); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private static void runSlurmJob(String splitPath) { + Process p; + try { + List<String> cmdList = new ArrayList<String>(); + cmdList.add("sbatch"); +// cmdList.add("sh"); + cmdList.add(SLURM_JOB_TEMPLATE_PATH); + // args + cmdList.add(DATAGEN_JAR_PATH); // 1st arg: datagen jar path + 
cmdList.add(REPO_INPUT_PATH); // 2nd arg: bare repo path + cmdList.add(splitPath); // 3rd arg: the file containing the paths of the split json files + cmdList.add(OUTPUT_PATH); // 4th arg: output path + // (these four positional args correspond to ${1}..${4} in slurmJob.sh) + ProcessBuilder pb = new ProcessBuilder(cmdList); + p = pb.start(); + + p.waitFor(); // fine here because sbatch prints very little; read the stream before waitFor() if output could be large + BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream())); + String line; + while ((line = reader.readLine()) != null) + System.out.println(line); + + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } +} diff --git a/src/java/boa/datagen/slurm/run-combiner.sh b/src/java/boa/datagen/slurm/run-combiner.sh new file mode 100644 index 000000000..e4bd1aed0 --- /dev/null +++ b/src/java/boa/datagen/slurm/run-combiner.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=24 +#SBATCH --mem=101G +#SBATCH --time=30-02:30:02 + +# ----- optional +#SBATCH --output=job.%J.out +#SBATCH --error=job.%J.err +#SBATCH --job-name="combine" +#SBATCH --partition=speedy + +# ----- load module +module load jdk + +JARFILE="./seq-repo-combiner.jar" +RAM="-Xmx100G" # need to change accordingly + +# local test
# OUTPUT_PATH="/Users/hyj/git/BoaData/DataSet/p3test" +# PROJECT_NUM_IN_AST="1" + +# remote +OUTPUT_PATH="/work/LAS/hridesh-lab/yijia/p3datagen/dataset_new" +PROJECT_NUM_IN_AST="10000" + +# main +CMD="java ${RAM} -Xss64M -jar \ +${JARFILE} \ +${OUTPUT_PATH} \ +${PROJECT_NUM_IN_AST}" + +echo "Execute: ${CMD}" +${CMD} \ No newline at end of file diff --git a/src/java/boa/datagen/slurm/run-generator.sh b/src/java/boa/datagen/slurm/run-generator.sh new file mode 100644 index 000000000..980bdffb9 --- /dev/null +++ b/src/java/boa/datagen/slurm/run-generator.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=2G +#SBATCH --time=30-02:30:02 + +# ----- optional +#SBATCH --output=job.%J.out +#SBATCH --error=job.%J.err +#SBATCH --job-name="run-gen" + +# ----- load module +module load jdk + +JARFILE="./seq-repo-generator.jar" +RAM="-Xmx1G" # need to change accordingly + +# local test +# SLURM_JOB_TEMPLATE_PATH="slurmJob.sh" +# JSON_INPUT_PATH="/Users/hyj/git/BoaData/DataGenInputJson" +# REPO_INPUT_PATH="/Users/hyj/git/BoaData/DataGenInputRepo" +# SPLIT_JSON_PATH="split" +# DATAGEN_JAR_PATH="seq-repo-builder.jar" +# OUTPUT_PATH="/Users/hyj/git/BoaData/DataSet/p3test" +# FILE_NUM_PER_JOB="1" + +# remote +SLURM_JOB_TEMPLATE_PATH="slurmJob.sh" +JSON_INPUT_PATH="/work/LAS/hridesh-lab/longvu/2020_java_dataset/2020_java_json_sized" +REPO_INPUT_PATH="/work/LAS/hridesh-lab/longvu/2020_java_dataset/input_repo_java" +SPLIT_JSON_PATH="split" +DATAGEN_JAR_PATH="seq-repo-builder.jar" +OUTPUT_PATH="/work/LAS/hridesh-lab/yijia/p3datagen/dataset_new" +FILE_NUM_PER_JOB="10" + +# main +CMD="java ${RAM} -Xss64M -jar \ +${JARFILE} \ +${SLURM_JOB_TEMPLATE_PATH} \ +${JSON_INPUT_PATH} \ +${REPO_INPUT_PATH} \ +${SPLIT_JSON_PATH} \ +${DATAGEN_JAR_PATH} \ +${OUTPUT_PATH} \ +${FILE_NUM_PER_JOB}" + +echo "Execute: ${CMD}" +${CMD} \ No newline at end of file diff --git a/src/java/boa/datagen/slurm/slurmJob.sh b/src/java/boa/datagen/slurm/slurmJob.sh new file mode 100644 index 000000000..21e3f61fa --- /dev/null +++ b/src/java/boa/datagen/slurm/slurmJob.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=25G +#SBATCH --time=30-02:30:02 + +# ----- optional +#SBATCH --output=job.%J.out +#SBATCH --error=job.%J.err +#SBATCH --job-name="datagen" + +# ----- load module +module load 
jdk + +# ----- main +# $1: datagen jar path +# $2: bare repo path +# $3: file listing the split json paths +# $4: output path +JARFILE="${1}" +REPO="${2}" +JSON_FILES="${3}" +OUTPUT="${4}" +RAM="-Xmx24G" # need to change accordingly + +# ----- run +CMD="java ${RAM} -Xss64M -jar \ +${JARFILE} \ +${REPO} \ +${JSON_FILES} \ +${OUTPUT}" + +echo "Execute: ${CMD}" +${CMD} \ No newline at end of file diff --git a/src/java/boa/functions/BoaAstIntrinsics.java b/src/java/boa/functions/BoaAstIntrinsics.java index c55d673e9..b905d6103 100644 --- a/src/java/boa/functions/BoaAstIntrinsics.java +++ b/src/java/boa/functions/BoaAstIntrinsics.java @@ -69,6 +69,7 @@ public class BoaAstIntrinsics { @SuppressWarnings("rawtypes") static Context context; + private static int curMapSuffix = -1; // only used if the dataset contains multiple ast maps private static MapFile.Reader map, commentsMap, issuesMap; private static final Revision emptyRevision; @@ -126,8 +127,13 @@ public static ASTRoot getast(final ChangedFile f) { context.getCounter(ASTCOUNTER.GETS_ATTEMPTED).increment(1); - if (map == null) - openMap(); + // no map is open yet, OR the open map's suffix doesn't match the changed file's ast key + if (map == null || (curMapSuffix != -1 && curMapSuffix != f.getAstKey())) { + if (!f.hasAstKey()) + openMap(); + else + openMap(f.getAstKey()); + } try { final BytesWritable value = new BytesWritable(); @@ -302,6 +308,31 @@ private static void openMap() { } } + private static void openMap(int mapSuffix) { + try { + final Configuration conf = context.getConfiguration(); + final FileSystem fs; + final Path p; + if (DefaultProperties.localDataPath != null) { + p = new Path(DefaultProperties.localDataPath, "ast/map" + mapSuffix); + fs = FileSystem.getLocal(conf); + } else { + p = new Path( + context.getConfiguration().get("fs.default.name", "hdfs://boa-njt/"), + new Path( + conf.get("boa.ast.dir", conf.get("boa.input.dir", "repcache/live")), + new Path("ast/map" + mapSuffix) + ) + ); + fs = FileSystem.get(conf); + } + map = new MapFile.Reader(fs, p.toString(), conf); + curMapSuffix = mapSuffix; + } catch (final Exception e) { + e.printStackTrace(); + } + } + private static void openCommentMap() { try { final Configuration conf = context.getConfiguration(); diff --git a/src/java/boa/runtime/BoaPartitioner.java b/src/java/boa/runtime/BoaPartitioner.java index 695a689ab..8c250c440 100644 --- a/src/java/boa/runtime/BoaPartitioner.java +++ b/src/java/boa/runtime/BoaPartitioner.java @@ -32,7 +32,7 @@ public class BoaPartitioner extends Partitioner<EmitKey, EmitValue> { private static String[] outputVariableNames = new String[0]; public int getPartition(final EmitKey key, final EmitValue value, final int num) { - return getPartitionForVariable(key.getName()); + return getPartitionForVariable(key.getName()) % num; } public static void setVariableNames(final String[] names) { diff --git a/src/proto/diff.proto b/src/proto/diff.proto index a1b3f7660..0f04b5e09 100644 --- a/src/proto/diff.proto +++ b/src/proto/diff.proto @@ -147,4 +147,6 @@ message ChangedFile { repeated int32 previous_versions = 9; /** @exclude The indices of the previous files in the list of changed files of the corresponding parent commits */ repeated int32 previous_indices = 10; + + /** @exclude The index of the ast map (ast/map<N>) that contains this file's AST */ + optional int32 ast_key = 11; }