diff --git a/src/compiled-proto/boa/types/Diff.java b/src/compiled-proto/boa/types/Diff.java
index 2967dd8b1..11e1d9cd7 100644
--- a/src/compiled-proto/boa/types/Diff.java
+++ b/src/compiled-proto/boa/types/Diff.java
@@ -249,6 +249,16 @@ public interface ChangedFileOrBuilder
*
*/
int getPreviousIndices(int index);
+
+ // optional int32 ast_key = 11;
+ /**
+ * optional int32 ast_key = 11;
+ */
+ boolean hasAstKey();
+ /**
+ * optional int32 ast_key = 11;
+ */
+ int getAstKey();
}
/**
* Protobuf type {@code boa.types.ChangedFile}
@@ -438,6 +448,11 @@ private ChangedFile(
input.popLimit(limit);
break;
}
+ case 88: {
+ bitField0_ |= 0x00000040;
+ astKey_ = input.readInt32();
+ break;
+ }
}
}
} catch (com.google.protobuf.InvalidProtocolBufferException e) {
@@ -1780,6 +1795,22 @@ public int getPreviousIndices(int index) {
return previousIndices_.get(index);
}
+ // optional int32 ast_key = 11;
+ public static final int AST_KEY_FIELD_NUMBER = 11;
+ private int astKey_;
+ /**
+ * optional int32 ast_key = 11;
+ */
+ public boolean hasAstKey() {
+ return ((bitField0_ & 0x00000040) == 0x00000040);
+ }
+ /**
+ * optional int32 ast_key = 11;
+ */
+ public int getAstKey() {
+ return astKey_;
+ }
+
private void initFields() {
change_ = boa.types.Shared.ChangeKind.UNKNOWN;
kind_ = boa.types.Diff.ChangedFile.FileKind.OTHER;
@@ -1791,6 +1822,7 @@ private void initFields() {
previousNames_ = com.google.protobuf.LazyStringArrayList.EMPTY;
previousVersions_ = java.util.Collections.emptyList();
previousIndices_ = java.util.Collections.emptyList();
+ astKey_ = 0;
}
private byte memoizedIsInitialized = -1;
public final boolean isInitialized() {
@@ -1860,6 +1892,9 @@ public void writeTo(com.google.protobuf.CodedOutputStream output)
for (int i = 0; i < previousIndices_.size(); i++) {
output.writeInt32(10, previousIndices_.get(i));
}
+ if (((bitField0_ & 0x00000040) == 0x00000040)) {
+ output.writeInt32(11, astKey_);
+ }
getUnknownFields().writeTo(output);
}
@@ -1929,6 +1964,10 @@ public int getSerializedSize() {
size += dataSize;
size += 1 * getPreviousIndicesList().size();
}
+ if (((bitField0_ & 0x00000040) == 0x00000040)) {
+ size += com.google.protobuf.CodedOutputStream
+ .computeInt32Size(11, astKey_);
+ }
size += getUnknownFields().getSerializedSize();
memoizedSerializedSize = size;
return size;
@@ -2074,6 +2113,8 @@ public Builder clear() {
bitField0_ = (bitField0_ & ~0x00000100);
previousIndices_ = java.util.Collections.emptyList();
bitField0_ = (bitField0_ & ~0x00000200);
+ astKey_ = 0;
+ bitField0_ = (bitField0_ & ~0x00000400);
return this;
}
@@ -2151,6 +2192,10 @@ public boa.types.Diff.ChangedFile buildPartial() {
bitField0_ = (bitField0_ & ~0x00000200);
}
result.previousIndices_ = previousIndices_;
+ if (((from_bitField0_ & 0x00000400) == 0x00000400)) {
+ to_bitField0_ |= 0x00000040;
+ }
+ result.astKey_ = astKey_;
result.bitField0_ = to_bitField0_;
onBuilt();
return result;
@@ -2227,6 +2272,9 @@ public Builder mergeFrom(boa.types.Diff.ChangedFile other) {
}
onChanged();
}
+ if (other.hasAstKey()) {
+ setAstKey(other.getAstKey());
+ }
this.mergeUnknownFields(other.getUnknownFields());
return this;
}
@@ -3150,6 +3198,39 @@ public Builder clearPreviousIndices() {
return this;
}
+ // optional int32 ast_key = 11;
+ private int astKey_ ;
+ /**
+ * optional int32 ast_key = 11;
+ */
+ public boolean hasAstKey() {
+ return ((bitField0_ & 0x00000400) == 0x00000400);
+ }
+ /**
+ * optional int32 ast_key = 11;
+ */
+ public int getAstKey() {
+ return astKey_;
+ }
+ /**
+ * optional int32 ast_key = 11;
+ */
+ public Builder setAstKey(int value) {
+ bitField0_ |= 0x00000400;
+ astKey_ = value;
+ onChanged();
+ return this;
+ }
+ /**
+ * optional int32 ast_key = 11;
+ */
+ public Builder clearAstKey() {
+ bitField0_ = (bitField0_ & ~0x00000400);
+ astKey_ = 0;
+ onChanged();
+ return this;
+ }
+
// @@protoc_insertion_point(builder_scope:boa.types.ChangedFile)
}
@@ -3176,38 +3257,38 @@ public Builder clearPreviousIndices() {
static {
java.lang.String[] descriptorData = {
"\n\ndiff.proto\022\tboa.types\032\014shared.proto\032\ta" +
- "st.proto\"\325\t\n\013ChangedFile\022%\n\006change\030\001 \002(\016" +
+ "st.proto\"\346\t\n\013ChangedFile\022%\n\006change\030\001 \002(\016" +
"2\025.boa.types.ChangeKind\022-\n\004kind\030\002 \002(\0162\037." +
"boa.types.ChangedFile.FileKind\022\014\n\004name\030\003" +
" \002(\t\022\013\n\003key\030\004 \002(\004\022\013\n\003ast\030\005 \002(\010\022)\n\010commen" +
"ts\030\006 \001(\0132\027.boa.types.CommentsRoot\022&\n\007cha" +
"nges\030\007 \003(\0162\025.boa.types.ChangeKind\022\026\n\016pre" +
"vious_names\030\010 \003(\t\022\031\n\021previous_versions\030\t" +
- " \003(\005\022\030\n\020previous_indices\030\n \003(\005\"\247\007\n\010FileK" +
- "ind\022\t\n\005OTHER\020\000\022\n\n\006BINARY\020\001\022\010\n\004TEXT\020\002\022\007\n\003",
- "XML\020\003\022\025\n\021SOURCE_JAVA_ERROR\020d\022\024\n\020SOURCE_J" +
- "AVA_JLS2\020f\022\024\n\020SOURCE_JAVA_JLS3\020g\022\024\n\020SOUR" +
- "CE_JAVA_JLS4\020h\022\024\n\020SOURCE_JAVA_JLS8\020l\022\016\n\n" +
- "JAVA_ERROR\020d\022\010\n\004JLS2\020f\022\010\n\004JLS3\020g\022\010\n\004JLS4" +
- "\020h\022\010\n\004JLS8\020l\022\024\n\017SOURCE_CS_ERROR\020\310\001\022\022\n\rSO" +
- "URCE_CS_CS1\020\311\001\022\022\n\rSOURCE_CS_CS2\020\312\001\022\022\n\rSO" +
- "URCE_CS_CS3\020\313\001\022\022\n\rSOURCE_CS_CS4\020\314\001\022\022\n\rSO" +
- "URCE_CS_CS5\020\315\001\022\r\n\010CS_ERROR\020\310\001\022\010\n\003CS1\020\311\001\022" +
- "\010\n\003CS2\020\312\001\022\010\n\003CS3\020\313\001\022\010\n\003CS4\020\314\001\022\010\n\003CS5\020\315\001\022" +
- "\024\n\017SOURCE_JS_ERROR\020\254\002\022\022\n\rSOURCE_JS_ES1\020\255",
- "\002\022\022\n\rSOURCE_JS_ES2\020\256\002\022\022\n\rSOURCE_JS_ES3\020\257" +
- "\002\022\022\n\rSOURCE_JS_ES5\020\260\002\022\022\n\rSOURCE_JS_ES6\020\261" +
- "\002\022\022\n\rSOURCE_JS_ES7\020\262\002\022\022\n\rSOURCE_JS_ES8\020\263" +
- "\002\022\r\n\010JS_ERROR\020\254\002\022\025\n\020SOURCE_PHP_ERROR\020\220\003\022" +
- "\020\n\013SOURCE_PHP5\020\221\003\022\022\n\rSOURCE_PHP5_3\020\222\003\022\022\n" +
- "\rSOURCE_PHP5_4\020\223\003\022\022\n\rSOURCE_PHP5_5\020\224\003\022\022\n" +
- "\rSOURCE_PHP5_6\020\225\003\022\022\n\rSOURCE_PHP7_0\020\226\003\022\022\n" +
- "\rSOURCE_PHP7_1\020\227\003\022\016\n\tPHP_ERROR\020\220\003\022\026\n\021SOU" +
- "RCE_HTML_ERROR\020\364\003\022\020\n\013Source_HTML\020\365\003\022\017\n\nH" +
- "TML_ERROR\020\364\003\022\025\n\020SOURCE_XML_ERROR\020\330\004\022\017\n\nS",
- "ource_XML\020\331\004\022\016\n\tXML_ERROR\020\330\004\022\025\n\020SOURCE_C" +
- "SS_ERROR\020\274\005\022\017\n\nSource_CSS\020\275\005\022\016\n\tCSS_ERRO" +
- "R\020\274\005\032\002\020\001B\002H\001"
+ " \003(\005\022\030\n\020previous_indices\030\n \003(\005\022\017\n\007ast_ke" +
+ "y\030\013 \001(\005\"\247\007\n\010FileKind\022\t\n\005OTHER\020\000\022\n\n\006BINAR",
+ "Y\020\001\022\010\n\004TEXT\020\002\022\007\n\003XML\020\003\022\025\n\021SOURCE_JAVA_ER" +
+ "ROR\020d\022\024\n\020SOURCE_JAVA_JLS2\020f\022\024\n\020SOURCE_JA" +
+ "VA_JLS3\020g\022\024\n\020SOURCE_JAVA_JLS4\020h\022\024\n\020SOURC" +
+ "E_JAVA_JLS8\020l\022\016\n\nJAVA_ERROR\020d\022\010\n\004JLS2\020f\022" +
+ "\010\n\004JLS3\020g\022\010\n\004JLS4\020h\022\010\n\004JLS8\020l\022\024\n\017SOURCE_" +
+ "CS_ERROR\020\310\001\022\022\n\rSOURCE_CS_CS1\020\311\001\022\022\n\rSOURC" +
+ "E_CS_CS2\020\312\001\022\022\n\rSOURCE_CS_CS3\020\313\001\022\022\n\rSOURC" +
+ "E_CS_CS4\020\314\001\022\022\n\rSOURCE_CS_CS5\020\315\001\022\r\n\010CS_ER" +
+ "ROR\020\310\001\022\010\n\003CS1\020\311\001\022\010\n\003CS2\020\312\001\022\010\n\003CS3\020\313\001\022\010\n\003" +
+ "CS4\020\314\001\022\010\n\003CS5\020\315\001\022\024\n\017SOURCE_JS_ERROR\020\254\002\022\022",
+ "\n\rSOURCE_JS_ES1\020\255\002\022\022\n\rSOURCE_JS_ES2\020\256\002\022\022" +
+ "\n\rSOURCE_JS_ES3\020\257\002\022\022\n\rSOURCE_JS_ES5\020\260\002\022\022" +
+ "\n\rSOURCE_JS_ES6\020\261\002\022\022\n\rSOURCE_JS_ES7\020\262\002\022\022" +
+ "\n\rSOURCE_JS_ES8\020\263\002\022\r\n\010JS_ERROR\020\254\002\022\025\n\020SOU" +
+ "RCE_PHP_ERROR\020\220\003\022\020\n\013SOURCE_PHP5\020\221\003\022\022\n\rSO" +
+ "URCE_PHP5_3\020\222\003\022\022\n\rSOURCE_PHP5_4\020\223\003\022\022\n\rSO" +
+ "URCE_PHP5_5\020\224\003\022\022\n\rSOURCE_PHP5_6\020\225\003\022\022\n\rSO" +
+ "URCE_PHP7_0\020\226\003\022\022\n\rSOURCE_PHP7_1\020\227\003\022\016\n\tPH" +
+ "P_ERROR\020\220\003\022\026\n\021SOURCE_HTML_ERROR\020\364\003\022\020\n\013So" +
+ "urce_HTML\020\365\003\022\017\n\nHTML_ERROR\020\364\003\022\025\n\020SOURCE_",
+ "XML_ERROR\020\330\004\022\017\n\nSource_XML\020\331\004\022\016\n\tXML_ERR" +
+ "OR\020\330\004\022\025\n\020SOURCE_CSS_ERROR\020\274\005\022\017\n\nSource_C" +
+ "SS\020\275\005\022\016\n\tCSS_ERROR\020\274\005\032\002\020\001B\002H\001"
};
com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner =
new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
@@ -3219,7 +3300,7 @@ public com.google.protobuf.ExtensionRegistry assignDescriptors(
internal_static_boa_types_ChangedFile_fieldAccessorTable = new
com.google.protobuf.GeneratedMessage.FieldAccessorTable(
internal_static_boa_types_ChangedFile_descriptor,
- new java.lang.String[] { "Change", "Kind", "Name", "Key", "Ast", "Comments", "Changes", "PreviousNames", "PreviousVersions", "PreviousIndices", });
+ new java.lang.String[] { "Change", "Kind", "Name", "Key", "Ast", "Comments", "Changes", "PreviousNames", "PreviousVersions", "PreviousIndices", "AstKey", });
return null;
}
};
diff --git a/src/java/boa/datagen/forges/github/GetReposByLanguage.java b/src/java/boa/datagen/forges/github/GetReposByLanguage.java
index 4c5dd5ba8..8eeef9135 100644
--- a/src/java/boa/datagen/forges/github/GetReposByLanguage.java
+++ b/src/java/boa/datagen/forges/github/GetReposByLanguage.java
@@ -1,26 +1,49 @@
package boa.datagen.forges.github;
+import java.io.BufferedWriter;
import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.file.Files;
+import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
+import java.util.HashSet;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
-import boa.datagen.util.FileIO;
+import boa.datagen.util.FileIO;
public class GetReposByLanguage {
-
+
+ // Only fetch projects whose star count is greater than or equal to this number
+ static int stars = 0;
+ static long start, stop = 0;
+
public static void main(String[] args) {
+
+ if (args.length < 4) {
+ System.out.println("args: TOKEN_FILE_INPUT_PATH, OUTPUT_PATH, STARS, LANGS");
+ return;
+ }
+
TokenList tokens = new TokenList(args[0]);
String outDir = args[1];
- String[] languages = { "java" };
- if (args.length > 2) {
- languages = new String[args.length - 2];
- for (int i = 2; i < args.length; i++)
- languages[i - 2] = args[i];
+ stars = Integer.parseInt(args[2]);
+ String[] languages = args[3].split(";");
+
+ if (args.length > 3) {
+ String langArgs = "";
+ for (int i = 3; i < args.length; i++) {
+ langArgs += args[i];
+ }
+ languages = langArgs.split(",");
+ for (int i = 0; i < languages.length; i++)
+ languages[i] = languages[i].trim();
}
Thread[] workers = new Thread[languages.length];
for (int i = 0; i < languages.length; i++) {
@@ -35,91 +58,115 @@ public static void main(String[] args) {
e.printStackTrace();
}
}
-
-
+
public static class Worker implements Runnable {
- private final int id;
+ private final int id;
private final String language;
private TokenList tokens;
private final String outDir;
private JsonArray repos = new JsonArray();
private final int RECORDS_PER_FILE = 100;
private int counter = 0;
-
+ private HashSet processedRepID = new HashSet<>();
+ private ArrayList IDtoWrite = new ArrayList<>();
+
public Worker(int id, String language, String outDir, TokenList tokenList) {
this.id = id;
this.language = language;
this.outDir = outDir;
this.tokens = tokenList;
+ File processedRepos = new File(outDir + "/" + language + "processed.txt");
+ try {
+ processedRepos.createNewFile();
+ Files.lines(processedRepos.toPath()).forEach(repID -> processedRepID.add(Integer.parseInt(repID)));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
-
+
@Override
public void run() {
+ start = System.currentTimeMillis();
Calendar cal = Calendar.getInstance();
cal.setTime(new Date());
cal.add(Calendar.DATE, 1);
int year = cal.get(Calendar.YEAR);
int month = cal.get(Calendar.MONTH) + 1; // month starts from 0
int day = cal.get(Calendar.DAY_OF_MONTH);
+
+ String monthString = month < 10 ? "0" + month : String.valueOf(month);
+ String dayString = day < 10 ? "0" + day : String.valueOf(day);
+ String time = year + "-" + monthString + "-" + dayString + "T23:59:59Z";
+
// String time = "2018-12-21T01:01:01Z";
- String time = year + "-" + month + "-" + day + "T23:59:59Z";
+// String time = year + "-" + month + "-" + day + "T23:59:59Z";
Gson parser = new Gson();
- while (true){
- Token tok = this.tokens.getNextAuthenticToken("https://api.github.com/repositories");
- String url = "https://api.github.com/search/repositories?q=language:" + language +"+stars:>1+pushed:<=" + time + "&sort=updated&order=desc&per_page=100";
- System.out.println(url);
- MetadataCacher mc = new MetadataCacher(url, tok.getUserName(), tok.getToken());
- mc.authenticate();
- while (!mc.isAuthenticated() || mc.getNumberOfRemainingLimit() <= 0) {
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e1) {
- e1.printStackTrace();
- }
- mc = new MetadataCacher(url, tok.getUserName(), tok.getToken());
- mc.authenticate();
- }
- mc.getResponseJson();
- String content = mc.getContent();
+ Token tokenGetAPI = this.tokens.getNextAuthenticTokenM("https://api.github.com");
+ Token tokenSearch = this.tokens.getNextAuthenticTokenM("https://api.github.com/search/repositories?q=language");
+ MetadataCacher mcGetAPI = new MetadataCacher("https://api.github.com/repositories", tokenGetAPI.getUserName(), tokenGetAPI.getToken());
+ mcGetAPI.authenticate();
+ while (true) {
+ String searchURL = "https://api.github.com/search/repositories?q=language:" + language + "+stars:>=" + stars
+ + "+pushed:<=" + time + "&sort=updated&order=desc&per_page=100";
+ System.out.println(searchURL);
+ MetadataCacher mcSearch = new MetadataCacher(searchURL, tokenSearch.getUserName(), tokenSearch.getToken());
+ mcSearch.authenticate();
+ // No remaining-limit check is needed for search requests: the search quota (30 requests per minute) refills faster than we can
+ // process results. The disabled loop below could become useful if we ever manage to handle 100 repos in about 2 seconds.
+// while (mcSearch.getNumberOfRemainingLimit() <= 0) {
+// System.out.println("user: " + tokenSearch.getUserName() + " limit: " + mcSearch.getNumberOfRemainingLimit());
+// tokenSearch = this.tokens.getNextAuthenticToken(searchURL);
+// mcSearch = new MetadataCacher(searchURL, tokenSearch.getUserName(), tokenSearch.getToken());
+// }
+ mcSearch.getResponseJson();
+ String content = mcSearch.getContent();
+
JsonObject json = null;
json = parser.fromJson(content, JsonElement.class).getAsJsonObject();
- JsonArray items = json.getAsJsonArray("items");
- if (items.size() > 0) {
- for (int j = 0; j < items.size(); j++) {
- JsonObject item = items.get(j).getAsJsonObject();
- this.addRepo(item);
- String pushed = item.get("pushed_at").getAsString();
- if (pushed.compareTo(time) < 0){
- time = pushed;
- }
- }
- }
- int count = json.get("total_count").getAsInt();
- if (count == items.size())
- break;
- if (tok.getNumberOfRemainingLimit() <= 1) {
- long t = mc.getLimitResetTime() * 1000 - System.currentTimeMillis();
- if (t >= 0) {
- System.out.println("Waiting " + (t/1000) + " seconds for sending more requests.");
- try {
- Thread.sleep(t);
- } catch (InterruptedException e) {
- e.printStackTrace();
+ JsonArray items = json.getAsJsonArray("items");
+
+ if (items.size() > 0) {
+ int getLimit = mcGetAPI.getNumberOfRemainingLimit();
+ System.out.println("Get API rate limit remaining: " + getLimit);
+
+ if (getLimit < items.size() + 1 || !mcGetAPI.isAuthenticated()) {
+ tokenGetAPI = this.tokens.getNextAuthenticToken("https://api.github.com/repositories", items.size() + 2); // 1 for getting authentic token, 1 for authenticating language link
+ mcGetAPI = new MetadataCacher(mcGetAPI.getUrl(), tokenGetAPI.getUserName(), tokenGetAPI.getToken());
+ }
+ for (int j = 0; j < items.size(); j++) {
+ JsonObject item = items.get(j).getAsJsonObject();
+ // check if repository is already saved
+ int repID = item.get("id").getAsInt();
+ if (!processedRepID.contains(repID)) {
+ mcGetAPI = addLanguageToRepo(item, parser, mcGetAPI);
+
+ this.addRepo(item);
+ processedRepID.add(repID);
+ IDtoWrite.add(repID);
+ } else {
+ System.out.println(repID + " already written");
+ }
+ String pushed = item.get("pushed_at").getAsString();
+ if (pushed.compareTo(time) < 0) {
+ time = pushed;
}
}
+ System.out.println(" ");
}
+ int count = json.get("total_count").getAsInt(); // count will not be static
+ if (count == items.size())
+ break;
}
writeRemainingRepos();
- }
-
+ }
+
private void addRepo(JsonObject repo) {
File fileToWriteJson = null;
this.repos.add(repo);
if (this.repos.size() % RECORDS_PER_FILE == 0) {
- fileToWriteJson = new File(
- outDir + "/Thread-" + this.id + "-page-" + counter + ".json");
+ fileToWriteJson = new File(outDir + "/Thread-" + this.id + "-page-" + counter + ".json");
while (fileToWriteJson.exists()) {
System.out.println(fileToWriteJson.getAbsolutePath() + " arleady exist");
counter++;
@@ -128,6 +175,20 @@ private void addRepo(JsonObject repo) {
FileIO.writeFileContents(fileToWriteJson, this.repos.toString());
System.out.println(Thread.currentThread().getId() + " " + counter++);
this.repos = new JsonArray();
+ try (FileWriter fw = new FileWriter(outDir + "/" + language + "processed.txt", true);
+ BufferedWriter bw = new BufferedWriter(fw);
+ PrintWriter out = new PrintWriter(bw)) {
+ for (Integer repID : IDtoWrite) {
+ out.println(repID.intValue());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ } finally {
+ IDtoWrite.clear();
+ }
+ stop = System.currentTimeMillis();
+ System.out.println("Time taken: " + (stop - start) / 1000.0 + "seconds");
+ start = stop;
}
}
@@ -140,8 +201,25 @@ public void writeRemainingRepos() {
fileToWriteJson = new File(outDir + "/Thread-" + this.id + "-page-" + counter + ".json");
}
FileIO.writeFileContents(fileToWriteJson, this.repos.toString());
- System.out.println(this.id + counter++);
+ System.out.println(this.id + counter++);
}
}
+
+ // Fetches the repo's /languages JSON and stores it on the repo object under "language_list"; on failed authentication only logs the response code. Returns the MetadataCacher created for that request so the caller can read the remaining rate limit without authenticating again.
+ private MetadataCacher addLanguageToRepo(JsonObject repo, Gson parser, MetadataCacher mc) {
+ String langurl = "https://api.github.com/repos/" + repo.get("full_name").getAsString() + "/languages";
+ mc = new MetadataCacher(langurl, mc.getUsername(), mc.getPassword());
+ if (mc.authenticate()) {
+ mc.getResponse();
+ String pageContent = mc.getContent();
+ JsonObject languages = parser.fromJson(pageContent, JsonElement.class).getAsJsonObject();
+ repo.add("language_list", languages);
+ } else {
+ final int responsecode = mc.getResponseCode();
+ System.err.println("authentication error " + responsecode);
+ }
+ return mc;
+ }
+
}
-}
+}
\ No newline at end of file
diff --git a/src/java/boa/datagen/forges/github/MetadataCacher.java b/src/java/boa/datagen/forges/github/MetadataCacher.java
index 5966292bc..ef4be5ede 100644
--- a/src/java/boa/datagen/forges/github/MetadataCacher.java
+++ b/src/java/boa/datagen/forges/github/MetadataCacher.java
@@ -44,14 +44,26 @@ public void setUrl(String url) {
e.printStackTrace();
}
}
+ // Returns the username field. NOTE(review): same value as getUsername() below; the two accessors are duplicates, consider keeping only one.
+ public String getUserName() {
+ return this.username;
+ }
public void setUsername(String username) {
this.username = username;
}
+ // Returns the username field, i.e. the value last stored by setUsername.
+ public String getUsername() {
+ return username;
+ }
public void setPassword(String password) {
this.password = password;
}
+ // Returns the password field, i.e. the value last stored by setPassword (presumably an API token for GitHub callers; verify against the constructor).
+ public String getPassword() {
+ return password;
+ }
public boolean isAuthenticated() {
return authenticated;
@@ -79,6 +91,7 @@ public boolean authenticate(String username, String password) {
} catch (IOException e) {
// considered as failed
}
+// System.out.println("authenticate: " + this.authenticated);
return this.authenticated;
}
@@ -151,10 +164,14 @@ public int getNumberOfMaxLimit() {
}
public int getNumberOfRemainingLimit() {
- return Integer.parseInt(this.connection.getHeaderField("X-RateLimit-Remaining"));
+ try {
+ return Integer.parseInt(this.connection.getHeaderField("X-RateLimit-Remaining"));
+ } catch(NumberFormatException e) {
+ return -1;
+ }
}
public long getLimitResetTime() {
return Long.parseLong(this.connection.getHeaderField("X-RateLimit-Reset"));
}
-}
+}
\ No newline at end of file
diff --git a/src/java/boa/datagen/forges/github/TokenList.java b/src/java/boa/datagen/forges/github/TokenList.java
index dca3a7e1c..6907e7d95 100644
--- a/src/java/boa/datagen/forges/github/TokenList.java
+++ b/src/java/boa/datagen/forges/github/TokenList.java
@@ -33,17 +33,18 @@ public TokenList(String path) {
}
}
+
public Token getNextAuthenticToken(String url) {
MetadataCacher mc = null;
while (true) {
for (Token token : tokens) {
- // System.out.println("Trying token: " + token.getId());
mc = new MetadataCacher(url, token.getUserName(), token.getToken());
- if (mc.authenticate()) {
+ if (mc.authenticate() && mc.getNumberOfRemainingLimit() >= 1) {
if (this.lastUsedToken != token.getId()) {
this.lastUsedToken = token.getId();
- System.out.println("now using token: " + token.getId());
+// System.out.println("now using token: " + token.getId());
}
+// System.out.println("Use authentic token: " + token.getId() + " user: " + token.getUserName());
return token;
}
}
@@ -57,6 +58,105 @@ public Token getNextAuthenticToken(String url) {
// throw new IllegalArgumentException();
}
+ // Tries each token in turn and returns the MetadataCacher of the first one that authenticates against url; returns null when a token fails to authenticate while still reporting at least one remaining request. If no token qualifies, sleeps 10s and tries again.
+ public MetadataCacher getNextAuthenticMetadataCacher(String url) {
+ MetadataCacher mc = null;
+ while (true) {
+ for (Token token : tokens) {
+ System.out.println("Trying token " + token.getId());
+ mc = new MetadataCacher(url, token.getUserName(), token.getToken());
+ if (mc.authenticate()) {
+ if (this.lastUsedToken != token.getId()) {
+ this.lastUsedToken = token.getId();
+// System.out.println("now using token: " + token.getId());
+ }
+ System.out.println("Use authentic token: " + token.getId() + " user: " + token.getUserName());
+ return mc;
+ }
+ // authentication failed although requests remain, so the URL itself is presumably bad (e.g. 404)
+ if (mc.getNumberOfRemainingLimit() >= 1)
+ return null;
+ }
+ try {
+ System.out.println("waiting for token, going to sleep for 10s");
+ Thread.sleep(10000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ // throw new IllegalArgumentException();
+ }
+ // Returns the first token whose remaining rate limit, read after an authentication attempt against url, is non-negative; the authentication result itself is only logged, not required. If no token qualifies, sleeps 10s and tries again.
+ public Token getNextAuthenticTokenM(String url) {
+ MetadataCacher mc = null;
+ while (true) {
+ for (Token token : tokens) {
+ mc = new MetadataCacher(url, token.getUserName(), token.getToken());
+ System.out.println("Trying token: " + token.getUserName() + " for " + url.substring(0, Math.min(url.length(), 40)) + ((url.length() < 40) ? "" : "..."));
+ mc.authenticate();
+ System.out.println(mc.getUsername() + " " + mc.isAuthenticated());
+ if (mc.getNumberOfRemainingLimit() >= 0) {
+ System.out.println(mc.getNumberOfRemainingLimit());
+ if (this.lastUsedToken != token.getId()) {
+ this.lastUsedToken = token.getId();
+ System.out.println("now using token: " + token.getUserName());
+ }
+ return token;
+ }
+ }
+ try {
+// long t = mc.getLimitResetTime() * 1000 - System.currentTimeMillis();
+// if (t >= 0) { // could be useful if json is created too fast
+// System.out.println("Waiting " + (t / 1000) + " seconds for sending more requests.");
+// try {
+// Thread.sleep(t);
+// } catch (InterruptedException e) {
+// e.printStackTrace();
+// }
+// }
+ System.out.println("waiting for token, going to sleep for 10s");
+ Thread.sleep(10000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ // throw new IllegalArgumentException();
+ }
+
+
+ // Returns the first token that authenticates against url and still has at least minRateLimit requests remaining; tokens below the minimum are skipped, and if none qualifies the method sleeps 10s and scans the list again (it never returns null).
+ public Token getNextAuthenticToken(String url, int minRateLimit) {
+ MetadataCacher mc = null;
+ while (true) {
+ for (Token token : tokens) {
+ System.out.println("Trying token: " + token.getUserName() + " ");
+ mc = new MetadataCacher(url, token.getUserName(), token.getToken());
+ mc.authenticate();
+ if (mc.isAuthenticated()) {
+ int limitRemaining = mc.getNumberOfRemainingLimit();
+ if (limitRemaining < minRateLimit) {
+ System.out.println("Authenticated but have " + limitRemaining + " requests left, fewer than min rate limit of " + minRateLimit);
+ continue;
+ }
+ if (this.lastUsedToken != token.getId()) {
+ this.lastUsedToken = token.getId();
+ System.out.println("Now using token: " + token.getUserName() + " ");
+ }
+ System.out.println(limitRemaining); // reuse the value read above instead of parsing the header again
+ return token;
+ } else {
+ System.err.println("Can't authenticate, response code:" + mc.getResponseCode());
+ }
+ }
+ try {
+ System.out.println("waiting for token, going to sleep for 10s");
+ Thread.sleep(10000);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+ }
public synchronized Token getAuthenticatedToken(long threadId) {
while (true) {
@@ -83,4 +183,4 @@ public synchronized void removeToken(Token tok) {
public synchronized void addToken(Token tok) {
this.tokens.add(tok);
}
-}
+}
\ No newline at end of file
diff --git a/src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java b/src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java
similarity index 65%
rename from src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java
rename to src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java
index b0e435b4b..2c59b6452 100644
--- a/src/java/boa/datagen/forges/github/GitHubRepoBareDownloader.java
+++ b/src/java/boa/datagen/slurm/GitHubRepoBareDownloader.java
@@ -1,4 +1,4 @@
-package boa.datagen.forges.github;
+package boa.datagen.slurm;
import java.io.File;
import org.eclipse.jgit.api.Git;
@@ -8,6 +8,7 @@
import boa.datagen.util.FileIO;
+// Datagen Phase 1: download all bare repositories
public class GitHubRepoBareDownloader {
private static String INPUT_PATH; // The directory contains a list of repo json files
@@ -19,62 +20,63 @@ public static void main(String[] args) {
if (args.length < 3) {
System.out.println("args: INPUT_NAMES_PATH, OUTPUT_REPOS_PATH, THREAD_NUM");
- } else {
- INPUT_PATH = args[0];
- OUTPUT_REPOS_PATH = args[1];
- THREAD_NUM = Integer.parseInt(args[2]);
-
- File input = new File(INPUT_PATH);
-
- DownloadWorker[] workers = new DownloadWorker[THREAD_NUM];
- Thread[] threads = new Thread[THREAD_NUM];
- for (int i = 0; i < THREAD_NUM; i++) {
- workers[i] = new DownloadWorker(i);
- threads[i] = new Thread(workers[i]);
- threads[i].start();
- }
+ return;
+ }
- // assign tasks to workers
- for (File file : input.listFiles()) {
- if (!file.getName().endsWith(".json"))
- continue;
- JsonElement jsonTree = new JsonParser().parse(FileIO.readFileContents(file));
- for (JsonElement je : jsonTree.getAsJsonArray()) {
- String projectName = je.getAsJsonObject().get("html_url").getAsString()
- .replace("https://github.com/", "");
-
- boolean assigned = false;
- while (!assigned) {
- for (int j = 0; j < THREAD_NUM; j++) {
- if (workers[j].isReady()) {
- workers[j].setName(projectName);
- workers[j].setReady(false);
- assigned = true;
- break;
- }
- }
- try {
- Thread.sleep(100);
- } catch (InterruptedException e) {
- e.printStackTrace();
+ INPUT_PATH = args[0];
+ OUTPUT_REPOS_PATH = args[1];
+ THREAD_NUM = Integer.parseInt(args[2]);
+
+ File input = new File(INPUT_PATH);
+
+ DownloadWorker[] workers = new DownloadWorker[THREAD_NUM];
+ Thread[] threads = new Thread[THREAD_NUM];
+ for (int i = 0; i < THREAD_NUM; i++) {
+ workers[i] = new DownloadWorker(i);
+ threads[i] = new Thread(workers[i]);
+ threads[i].start();
+ }
+
+ // assign tasks to workers
+ for (File file : input.listFiles()) {
+ if (!file.getName().endsWith(".json"))
+ continue;
+ JsonElement jsonTree = new JsonParser().parse(FileIO.readFileContents(file));
+ for (JsonElement je : jsonTree.getAsJsonArray()) {
+ String projectName = je.getAsJsonObject().get("html_url").getAsString().replace("https://github.com/",
+ "");
+
+ boolean assigned = false;
+ while (!assigned) {
+ for (int j = 0; j < THREAD_NUM; j++) {
+ if (workers[j].isReady()) {
+ workers[j].setName(projectName);
+ workers[j].setReady(false);
+ assigned = true;
+ break;
}
}
- }
- }
-
- // wait for all done
- for (int j = 0; j < THREAD_NUM; j++) {
- while (!workers[j].isReady())
try {
Thread.sleep(100);
} catch (InterruptedException e) {
e.printStackTrace();
}
+ }
}
+ }
- setDone(true);
+ // wait for all done
+ for (int j = 0; j < THREAD_NUM; j++) {
+ while (!workers[j].isReady())
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
}
+ setDone(true);
+
}
synchronized static boolean getDone() {
@@ -138,7 +140,7 @@ private void runJob() {
result.getRepository().close();
}
} else {
- System.out.println("repo " + projectName + "already exists");
+ System.out.println("repo " + projectName + " already exists");
}
}
diff --git a/src/java/boa/datagen/slurm/SeqRepoBuilder.java b/src/java/boa/datagen/slurm/SeqRepoBuilder.java
new file mode 100644
index 000000000..9c182ca73
--- /dev/null
+++ b/src/java/boa/datagen/slurm/SeqRepoBuilder.java
@@ -0,0 +1,304 @@
+package boa.datagen.slurm;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+
+import boa.datagen.DefaultProperties;
+import boa.datagen.forges.github.RepoMetadata;
+import boa.datagen.scm.AbstractConnector;
+import boa.datagen.scm.GitConnector;
+import boa.datagen.util.FileIO;
+import boa.types.Code.CodeRepository;
+import boa.types.Code.Revision;
+import boa.types.Toplevel.Project;
+
+//Datagen Phase 2: DATAGEN_JAR used for each slurm job
+public class SeqRepoBuilder {
+
+ private static String REPO_PATH;
+ private static String JSON_FILES_PATH;
+ private static String OUTPUT_PATH;
+
+ private static Configuration conf = null;
+ private static FileSystem fileSystem = null;
+
+ private static String suffix;
+ private static SequenceFile.Writer projectWriter, astWriter, commitWriter, contentWriter;
+ private static long astWriterLen = 1, commitWriterLen = 1, contentWriterLen = 1;
+
+ private static int MAX_COMMITS = Integer.valueOf(DefaultProperties.MAX_SIZE_FOR_PROJECT_WITH_COMMITS);;
+ // Entry point. args: REPO_PATH, JSON_FILES_PATH, OUTPUT_PATH. Parses every JSON file returned by getJsonFilePaths() as an array of repo metadata and calls process() for each entry that has both an id and a name; files that fail to parse are reported and skipped.
+ public static void main(String[] args) throws IOException {
+
+ if (args.length < 3) {
+ System.err.println("Need args:\n" + "REPO_PATH\n" + "JSON_FILES_PATH\n" + "OUTPUT_PATH\n");
+ return;
+ }
+
+ REPO_PATH = args[0];
+ JSON_FILES_PATH = args[1];
+ OUTPUT_PATH = args[2];
+
+ conf = new Configuration();
+ fileSystem = FileSystem.get(conf);
+ boa.datagen.DefaultProperties.DEBUG = true;
+
+ int counter = 0;
+ for (String jsonFilePath : getJsonFilePaths()) {
+ File file = new File(jsonFilePath);
+ String content = FileIO.readFileContents(file);
+ Gson parser = new Gson();
+ JsonArray repoArray = null;
+ try {
+ repoArray = parser.fromJson(content, JsonElement.class).getAsJsonArray();
+ } catch (Exception e) {
+ System.err.println("Error proccessing page: " + file.getPath());
+ e.printStackTrace();
+ continue;
+ }
+ // iterate each json object (project metadata) in the json array
+ for (int i = 0; i < repoArray.size(); i++) {
+ JsonObject rp = repoArray.get(i).getAsJsonObject();
+ RepoMetadata repo = new RepoMetadata(rp);
+ if (repo.id != null && repo.name != null) {
+ System.out.println("Processing the " + (++counter) + "th project: " + repo.name);
+ // generate seq files for this project
+ Project project = repo.toBoaMetaDataProtobuf();
+ process(project);
+ }
+ }
+ }
+
+ // done
+ }
+ // Writes one project to the sequence files: opens the writers, fills in the repository via storeRepository, then appends the project either whole or with its revisions moved to the commit writer. On a null project or any Throwable the writers are closed and their files deleted via clear().
+ private static void process(Project project) {
+ String projectName = project.getName();
+ String[] writerPaths = openWriters(projectName);
+
+ // if writerPaths is null, the project has already been processed.
+ if (writerPaths == null) {
+ System.out.println(projectName + " seq file is already existing");
+ return;
+ }
+
+ try {
+ project = storeRepository(project, 0);
+ // if the project is null then skip this project
+ if (project == null) {
+ System.out.println(projectName + " is null skip this");
+ clear(writerPaths);
+ return;
+ }
+
+ // store project into sequence file
+ BytesWritable bw = new BytesWritable(project.toByteArray());
+ if (bw.getLength() <= MAX_COMMITS || (project.getCodeRepositoriesCount() > 0
+ && project.getCodeRepositories(0).getRevisionKeysCount() > 0)) {
+ // Approach 1: if the Project size is acceptable, then directly append the
+ // Project instance into the sequence file
+ projectWriter.append(new Text(project.getId()), bw);
+ } else {
+ // Approach 2: if the size is too large, extract Commit instances and append
+ // them into commit sequence file.
+ Project.Builder pb = Project.newBuilder(project);
+ for (CodeRepository.Builder cb : pb.getCodeRepositoriesBuilderList()) {
+ for (Revision.Builder rb : cb.getRevisionsBuilderList()) {
+ cb.addRevisionKeys(commitWriterLen);
+ bw = new BytesWritable(rb.build().toByteArray());
+ commitWriter.append(new LongWritable(commitWriterLen), bw);
+ commitWriterLen += bw.getLength();
+ }
+ cb.clearRevisions();
+ }
+ projectWriter.append(new Text(pb.getId()), new BytesWritable(pb.build().toByteArray()));
+ }
+ } catch (Throwable e) {
+ e.printStackTrace();
+ clear(writerPaths);
+ return;
+ }
+
+ System.out.println(projectName + " finished");
+ closeWriters();
+ }
+ // Closes the writers, then deletes every file listed in writerPaths that exists.
+ private static void clear(String[] writerPaths) {
+ closeWriters();
+ // remove sequence files
+ for (String path : writerPaths) {
+ File file = new File(path);
+ if (file.exists())
+ org.apache.commons.io.FileUtils.deleteQuietly(file);
+ }
+ }
+
+ private static Project storeRepository(final Project project, final int i) {
+ final CodeRepository repo = project.getCodeRepositories(i); // this is an empty code repo
+ final Project.Builder projBuilder = Project.newBuilder(project);
+
+ final String name = project.getName();
+ File gitDir = new File(REPO_PATH + "/" + name);
+
+ // return null to skip a filtered project
+ if (isFiltered(project)) {
+ System.err.println(name + " is filtered");
+ return null;
+ }
+
+ AbstractConnector conn = null;
+ try {
+ conn = new GitConnector(gitDir.getAbsolutePath(), project.getName(), astWriter, astWriterLen, commitWriter,
+ commitWriterLen, contentWriter, contentWriterLen);
+ final CodeRepository.Builder repoBuilder = CodeRepository.newBuilder(repo);
+
+ List