From 3f5f5e172a56816c6f0f9d07fe7efe11e6a2a698 Mon Sep 17 00:00:00 2001 From: robert2 Date: Tue, 31 Jul 2018 08:43:42 -0500 Subject: [PATCH 1/7] More useful error message on write errors --- src/java/boa/datagen/scm/AbstractCommit.java | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/java/boa/datagen/scm/AbstractCommit.java b/src/java/boa/datagen/scm/AbstractCommit.java index ce397004c..579dde317 100644 --- a/src/java/boa/datagen/scm/AbstractCommit.java +++ b/src/java/boa/datagen/scm/AbstractCommit.java @@ -526,8 +526,11 @@ private boolean parsePHPFile(final String path, final ChangedFile.Builder fb, fi BytesWritable bw = new BytesWritable(ast.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (IOException e) { - e.printStackTrace(); + } catch (Exception e) { + if (debug) { + System.err.println("ast write error with " + projectName); + e.printStackTrace(); + } } } return !errorCheck.hasError; @@ -581,8 +584,11 @@ private boolean parseJavaScriptFile(final String path, final ChangedFile.Builder BytesWritable bw = new BytesWritable(ast.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (IOException e) { - e.printStackTrace(); + } catch (Exception e) { + if (debug) { + System.err.println("ast write error with " + projectName); + e.printStackTrace(); + } } // fb.setComments(comments); } @@ -730,9 +736,11 @@ private boolean parseJavaFile(final String path, final ChangedFile.Builder fb, f BytesWritable bw = new BytesWritable(preAst.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (IOException e) { - if (debug) + } catch (Exception e) { + if (debug) { + System.err.println("ast write error with " + projectName); e.printStackTrace(); + } plen = Long.MAX_VALUE; } } From da42aa9fe27335cf4981815f2e61ddd7cc2472b3 Mon Sep 17 00:00:00 2001 From: robert2 Date: Tue, 31 Jul 2018 16:46:28 -0500 Subject: [PATCH 2/7] revert last change --- src/java/boa/datagen/scm/AbstractCommit.java | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/java/boa/datagen/scm/AbstractCommit.java b/src/java/boa/datagen/scm/AbstractCommit.java index 579dde317..ce397004c 100644 --- a/src/java/boa/datagen/scm/AbstractCommit.java +++ b/src/java/boa/datagen/scm/AbstractCommit.java @@ -526,11 +526,8 @@ private boolean parsePHPFile(final String path, final ChangedFile.Builder fb, fi BytesWritable bw = new BytesWritable(ast.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (Exception e) { - if (debug) { - System.err.println("ast write error with " + projectName); - e.printStackTrace(); - } + } catch (IOException e) { + e.printStackTrace(); } } return !errorCheck.hasError; @@ -584,11 +581,8 @@ private boolean parseJavaScriptFile(final String path, final ChangedFile.Builder BytesWritable bw = new BytesWritable(ast.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (Exception e) { - if (debug) { - System.err.println("ast write error with " + projectName); - e.printStackTrace(); - } + } catch (IOException e) { + e.printStackTrace(); } // fb.setComments(comments); } @@ -736,11 +730,9 @@ private boolean parseJavaFile(final String path, final ChangedFile.Builder fb, f BytesWritable bw = new BytesWritable(preAst.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (Exception e) { - if (debug) { - System.err.println("ast write error with " + projectName); + } catch (IOException e) { + if (debug) e.printStackTrace(); - } plen = Long.MAX_VALUE; } } From f9fde1b9fda71f0f6c7ede713ffa34e28ad1766d Mon Sep 17 00:00:00 2001 From: robert2 Date: Wed, 1 Aug 2018 08:44:10 -0500 Subject: [PATCH 3/7] clean up --- src/java/boa/datagen/scm/AbstractCommit.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/java/boa/datagen/scm/AbstractCommit.java b/src/java/boa/datagen/scm/AbstractCommit.java index 5731a223e..a76e7bd86 100644 --- a/src/java/boa/datagen/scm/AbstractCommit.java +++ b/src/java/boa/datagen/scm/AbstractCommit.java @@ -48,8 +48,6 @@ import boa.types.Shared.Person; import boa.datagen.DefaultProperties; import boa.datagen.dependencies.PomFile; -import boa.datagen.treed.TreedConstants; -import boa.datagen.treed.TreedMapper; import boa.datagen.util.CssVisitor; import boa.datagen.util.FileIO; import boa.datagen.util.HtmlVisitor; @@ -60,7 +58,6 @@ import boa.datagen.util.PHPVisitor; import boa.datagen.util.Properties; import boa.datagen.util.XMLVisitor; -import boa.datagen.util.JavaASTUtil; import boa.datagen.util.JavaErrorCheckVisitor; /** @@ -652,9 +649,11 @@ private boolean parseJavaFile(final String path, final ChangedFile.Builder fb, f BytesWritable bw = new BytesWritable(ast.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (IOException e) { - if (debug) + } catch (Exception e) { + if (debug) { + System.err.println("ast write error on project " + projectName); e.printStackTrace(); + } } // fb.setComments(comments); } From b37f994bb944fedadd86470e4be4463a7e63d56a Mon Sep 17 00:00:00 2001 From: Bob Schmidt Date: Wed, 1 Aug 2018 09:01:40 -0500 Subject: [PATCH 4/7] Revert "clean up" --- src/java/boa/datagen/scm/AbstractCommit.java | 15 +++++++++++++++ src/java/boa/datagen/scm/AbstractConnector.java | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/java/boa/datagen/scm/AbstractCommit.java b/src/java/boa/datagen/scm/AbstractCommit.java index a76e7bd86..ba360650c 100644 --- a/src/java/boa/datagen/scm/AbstractCommit.java +++ b/src/java/boa/datagen/scm/AbstractCommit.java @@ -207,6 +207,7 @@ else if (lowerPath.endsWith(".jar") || lowerPath.endsWith(".class")) fb.setKind(FileKind.BINARY); else if (lowerPath.endsWith(".java") && parse) { final String content = getFileContents(path); + fb.setKind(FileKind.SOURCE_JAVA_ERROR); parseJavaFile(path, fb, content, false); } else if (lowerPath.endsWith(".js") && parse) { @@ -243,6 +244,13 @@ else if (lowerPath.endsWith(".java") && parse) { System.err.println( "Found ES4 parse error in: revision " + id + ": file " + path); fb.setKind(FileKind.SOURCE_JS_ERROR); + // try { + // astWriter.append(new + // LongWritable(len), new + // BytesWritable(ASTRoot.newBuilder().build().toByteArray())); + // } catch (IOException e) { + // e.printStackTrace(); + // } } else if (debugparse) System.err.println("Accepted ES8: revision " + id + ": file " + path); } else if (debugparse) @@ -291,6 +299,13 @@ else if (lowerPath.endsWith(".java") && parse) { System.err.println( "Found ES4 parse error in: revision " + id + ": file " + path); fb.setKind(FileKind.SOURCE_PHP_ERROR); + // try { + // astWriter.append(new + // LongWritable(len), new + // BytesWritable(ASTRoot.newBuilder().build().toByteArray())); + // } catch (IOException e) { + // e.printStackTrace(); + // } } else if (debugparse) System.err.println("Accepted PHP7_1: revision " + id + ": file " + path); } else if (debugparse) diff --git a/src/java/boa/datagen/scm/AbstractConnector.java b/src/java/boa/datagen/scm/AbstractConnector.java index b0f8ed52c..0c10c1293 100644 --- a/src/java/boa/datagen/scm/AbstractConnector.java +++ b/src/java/boa/datagen/scm/AbstractConnector.java @@ -90,7 +90,7 @@ public List buildSnapshot(final int commitOffset, final String[] la final Map commits = new HashMap(); getSnapshot(commitOffset, snapshot, commits); - if (languages == null || languages.length == 0) + if (languages == null) return snapshot; boolean hasJava = false; From 8cb649f9b6f14f00568b4f03e31ec7314b286c22 Mon Sep 17 00:00:00 2001 From: Bob Schmidt Date: Wed, 1 Aug 2018 09:02:51 -0500 Subject: [PATCH 5/7] Revert "Revert "clean up"" --- src/java/boa/datagen/scm/AbstractCommit.java | 15 --------------- src/java/boa/datagen/scm/AbstractConnector.java | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/src/java/boa/datagen/scm/AbstractCommit.java b/src/java/boa/datagen/scm/AbstractCommit.java index ba360650c..a76e7bd86 100644 --- a/src/java/boa/datagen/scm/AbstractCommit.java +++ b/src/java/boa/datagen/scm/AbstractCommit.java @@ -207,7 +207,6 @@ else if (lowerPath.endsWith(".jar") || lowerPath.endsWith(".class")) fb.setKind(FileKind.BINARY); else if (lowerPath.endsWith(".java") && parse) { final String content = getFileContents(path); - fb.setKind(FileKind.SOURCE_JAVA_ERROR); parseJavaFile(path, fb, content, false); } else if (lowerPath.endsWith(".js") && parse) { @@ -244,13 +243,6 @@ else if (lowerPath.endsWith(".java") && parse) { System.err.println( "Found ES4 parse error in: revision " + id + ": file " + path); fb.setKind(FileKind.SOURCE_JS_ERROR); - // try { - // astWriter.append(new - // LongWritable(len), new - // BytesWritable(ASTRoot.newBuilder().build().toByteArray())); - // } catch (IOException e) { - // e.printStackTrace(); - // } } else if (debugparse) System.err.println("Accepted ES8: revision " + id + ": file " + path); } else if (debugparse) @@ -299,13 +291,6 @@ else if (lowerPath.endsWith(".java") && parse) { System.err.println( "Found ES4 parse error in: revision " + id + ": file " + path); fb.setKind(FileKind.SOURCE_PHP_ERROR); - // try { - // astWriter.append(new - // LongWritable(len), new - // BytesWritable(ASTRoot.newBuilder().build().toByteArray())); - // } catch (IOException e) { - // e.printStackTrace(); - // } } else if (debugparse) System.err.println("Accepted PHP7_1: revision " + id + ": file " + path); } else if (debugparse) diff --git a/src/java/boa/datagen/scm/AbstractConnector.java b/src/java/boa/datagen/scm/AbstractConnector.java index 0c10c1293..b0f8ed52c 100644 --- a/src/java/boa/datagen/scm/AbstractConnector.java +++ b/src/java/boa/datagen/scm/AbstractConnector.java @@ -90,7 +90,7 @@ public List buildSnapshot(final int commitOffset, final String[] la final Map commits = new HashMap(); getSnapshot(commitOffset, snapshot, commits); - if (languages == null) + if (languages == null || languages.length == 0) return snapshot; boolean hasJava = false; From ea9613e234eca0a750cc57a9a29bc822b1474817 Mon Sep 17 00:00:00 2001 From: robert2 Date: Wed, 1 Aug 2018 14:24:39 -0500 Subject: [PATCH 6/7] revert Catch message change --- src/java/boa/datagen/scm/AbstractCommit.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/java/boa/datagen/scm/AbstractCommit.java b/src/java/boa/datagen/scm/AbstractCommit.java index a76e7bd86..180de4597 100644 --- a/src/java/boa/datagen/scm/AbstractCommit.java +++ b/src/java/boa/datagen/scm/AbstractCommit.java @@ -649,11 +649,10 @@ private boolean parseJavaFile(final String path, final ChangedFile.Builder fb, f BytesWritable bw = new BytesWritable(ast.build().toByteArray()); connector.astWriter.append(new LongWritable(connector.astWriterLen), bw); connector.astWriterLen += bw.getLength(); - } catch (Exception e) { - if (debug) { - System.err.println("ast write error on project " + projectName); + } catch (IOException e) { + if (debug) e.printStackTrace(); - } + } // fb.setComments(comments); } From 2ff22a5d988c695ca0e6275654cbcba968eba492 Mon Sep 17 00:00:00 2001 From: robert2 Date: Thu, 2 Aug 2018 09:43:39 -0500 Subject: [PATCH 7/7] Process one project at a time. --- src/java/boa/datagen/BoaGenerator.java | 47 ++- src/java/boa/datagen/SeqRepoImporterJson.java | 364 ++++++++++++++++++ 2 files changed, 395 insertions(+), 16 deletions(-) create mode 100644 src/java/boa/datagen/SeqRepoImporterJson.java diff --git a/src/java/boa/datagen/BoaGenerator.java b/src/java/boa/datagen/BoaGenerator.java index 900ed7bd2..a99bff595 100644 --- a/src/java/boa/datagen/BoaGenerator.java +++ b/src/java/boa/datagen/BoaGenerator.java @@ -37,6 +37,7 @@ public class BoaGenerator { private static boolean jsonAvailable = true; private static boolean tokenAvailable = false; + private static boolean cacheJson = false; public static void main(final String[] args) throws IOException { final Options options = new Options(); @@ -58,23 +59,33 @@ public static void main(final String[] args) throws IOException { */ if (jsonAvailable) { - CacheGithubJSON.main(new String[0]); - try { - SeqRepoImporter.main(new String[0]); - } catch (InterruptedException e) { - e.printStackTrace(); + if (cacheJson) { + try { + SeqRepoImporterJson.main(new String[0]); + } catch (InterruptedException e) { + e.printStackTrace(); + } + + } else { + CacheGithubJSON.main(new String[0]); + try { + SeqRepoImporter.main(new String[0]); + } catch (InterruptedException e) { + e.printStackTrace(); + } } SeqCombiner.main(new String[0]); -// try { -// MapFileGen.main(new String[0]); -// } catch (Exception e) { -// e.printStackTrace(); -// } - } else if (tokenAvailable) { // when user provides local repo and does not have json files + // try { + // MapFileGen.main(new String[0]); + // } catch (Exception e) { + // e.printStackTrace(); + // } + } else if (tokenAvailable) { // when user provides local repo and does + // not have json files MetaDataMaster mdm = new MetaDataMaster(); mdm.downloadRepoNames(DefaultProperties.TOKEN, DefaultProperties.OUTPUT); - + SeqCombiner.main(new String[0]); } else { // when user provides local repo and does not have json files File output = new File(DefaultProperties.OUTPUT); @@ -87,7 +98,7 @@ public static void main(final String[] args) throws IOException { e.printStackTrace(); } } - + clear(); } @@ -110,7 +121,8 @@ private static void addOptions(Options options) { options.addOption("inputRepo", "json", true, "cloned repo path"); options.addOption("threads", "threads", true, "number of threads"); options.addOption("projects", "projects", true, "maximum number of projects per sequence file"); - options.addOption("commits", "commits", true, "maximum number of commits of a project to be stored in the project object"); + options.addOption("commits", "commits", true, + "maximum number of commits of a project to be stored in the project object"); options.addOption("size", "size", true, "maximum size of a project object to be stored"); options.addOption("libs", "libs", true, "directory to store libraries"); options.addOption("output", "json", true, "directory where output is desired"); @@ -121,6 +133,7 @@ private static void addOptions(Options options) { options.addOption("cache", "json", false, "enable if you want to delete the cloned code for user."); options.addOption("debug", "json", false, "enable for debug mode."); options.addOption("debugparse", "json", false, "enable for debug mode when parsing source files."); + options.addOption("cacheJson", "cacheJson", false, "enable to process one project at a time."); options.addOption("help", "help", true, "help"); } @@ -134,7 +147,7 @@ private static void handleCmdOptions(CommandLine cl, Options options, final Stri DefaultProperties.GH_JSON_PATH = cl.getOptionValue("inputJson"); DefaultProperties.OUTPUT = cl.getOptionValue("output"); DefaultProperties.GH_GIT_PATH = cl.getOptionValue("output"); - }else if (cl.hasOption("inputToken") && cl.hasOption("inputRepo") && cl.hasOption("output")) { + } else if (cl.hasOption("inputToken") && cl.hasOption("inputRepo") && cl.hasOption("output")) { DefaultProperties.TOKEN = cl.getOptionValue("inputToken"); DefaultProperties.OUTPUT = cl.getOptionValue("output"); // DefaultProperties.GH_GIT_PATH = GH_JSON_CACHE_PATH + "/github"; @@ -185,6 +198,9 @@ private static void handleCmdOptions(CommandLine cl, Options options, final Stri if (cl.hasOption("debug")) { DefaultProperties.DEBUG = true; } + if (cl.hasOption("cacheJson")) { + cacheJson = true; + } if (cl.hasOption("debugparse")) { DefaultProperties.DEBUGPARSE = true; } @@ -202,7 +218,6 @@ private static void clear() { if (inputDirectory.exists()) org.apache.commons.io.FileUtils.deleteQuietly(inputDirectory); } - private static void getGithubMetadata(String inputPath, String username, String password, String targetUser, String targetRepo) { diff --git a/src/java/boa/datagen/SeqRepoImporterJson.java b/src/java/boa/datagen/SeqRepoImporterJson.java new file mode 100644 index 000000000..0391e6497 --- /dev/null +++ b/src/java/boa/datagen/SeqRepoImporterJson.java @@ -0,0 +1,364 @@ +package boa.datagen; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.SequenceFile.CompressionType; + +import com.google.gson.Gson; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.protobuf.InvalidProtocolBufferException; + +import boa.datagen.forges.github.RepoMetadata; +import boa.datagen.forges.github.RepositoryCloner; +import boa.datagen.scm.AbstractConnector; +import boa.datagen.scm.GitConnector; +import boa.datagen.util.FileIO; +import boa.datagen.util.Properties; +import boa.types.Code.CodeRepository; +import boa.types.Code.Revision; +import boa.types.Toplevel.Project; + +public class SeqRepoImporterJson { + private final static boolean debug = Properties.getBoolean("debug", DefaultProperties.DEBUG); + private final static boolean cache = Properties.getBoolean("cache", DefaultProperties.CACHE); + + private final static File gitRootPath = new File( + Properties.getProperty("gh.svn.path", DefaultProperties.GH_GIT_PATH)); + + private final static HashSet processedProjectIds = new HashSet(); + + private static Configuration conf = null; + private static FileSystem fileSystem = null; + private static String base = null; + + private final static int poolSize = Integer + .parseInt(Properties.getProperty("num.threads", DefaultProperties.NUM_THREADS)); + public static final int MAX_SIZE_FOR_PROJECT_WITH_COMMITS = Integer + .valueOf(DefaultProperties.MAX_SIZE_FOR_PROJECT_WITH_COMMITS); + // private static int processedProjects = 0; + final static String jsonPath = Properties.getProperty("gh.json.path", DefaultProperties.GH_JSON_PATH); + final static String jsonCachePath = Properties.getProperty("output.path", DefaultProperties.OUTPUT); + + public static void main(String[] args) throws IOException, InterruptedException { + + conf = new Configuration(); + fileSystem = FileSystem.get(conf); + base = Properties.getProperty("output.path", DefaultProperties.OUTPUT); + + getProcessedProjects(); + ImportTask[] workers = new ImportTask[poolSize]; + + for (int i = 0; i < poolSize; i++) { + ImportTask worker = new ImportTask(i); + workers[i] = worker; + workers[i].openWriters(); + } + + File dir = new File(jsonPath); + for (File file : dir.listFiles()) { + if (file.getName().endsWith(".json")) { + String content = FileIO.readFileContents(file); + Gson parser = new Gson(); + + JsonArray repoArray = null; + try { + repoArray = parser.fromJson(content, JsonElement.class).getAsJsonArray(); + } catch (Exception e) { + System.err.println("error proccessing page: " + file.getPath()); + e.printStackTrace(); + } + for (int i = 0; i < repoArray.size(); i++) { + RepoMetadata repo = new RepoMetadata(repoArray.get(i).getAsJsonObject()); + // JsonObject rp = repoArray.get(i).getAsJsonObject(); + if (repo.id != null && repo.name != null) { + try { + Project protobufRepo = repo.toBoaMetaDataProtobuf(); + // System.out.println(jRepo.toString()); + boolean assigned = false; + while (!assigned) { + for (int j = 0; j < poolSize; j++) { + if (workers[j].isReady()) { + workers[j].setProject(protobufRepo.toByteArray()); + new Thread(workers[j]).start(); + assigned = true; + Thread.sleep(10); + break; + } + } + Thread.sleep(10); + } + } catch (Exception e) { + System.err.println("error proccessing page: " + file.getPath()); + e.printStackTrace(); + } + System.out.println(file.getPath() + ": " + i + ": " + repo.id + " " + repo.name); + } + } + } + } + + for (ImportTask worker : workers) { + while (!worker.isReady()) + Thread.sleep(100); + worker.closeWriters(); + } + } + + private static void getProcessedProjects() throws IOException { + FileStatus[] files = fileSystem.listStatus(new Path(base + "/project")); + for (int i = 0; i < files.length; i++) { + FileStatus file = files[i]; + String name = file.getPath().getName(); + if (name.endsWith(".seq")) { + SequenceFile.Reader r = null; + try { + r = new SequenceFile.Reader(fileSystem, file.getPath(), conf); + final Text key = new Text(); + while (r.next(key)) { + processedProjectIds.add(key.toString()); + } + r.close(); + } catch (IOException e) { + if (r != null) + r.close(); + for (String dir : new String[] { "ast", "commit", "source" }) + fileSystem.delete(new Path(base + "/" + dir + "/" + name), false); + } + } + } + // processedProjects = processedProjectIds.size(); + System.out.println("Got processed projects: " + processedProjectIds.size()); + } + + public static class ImportTask implements Runnable { + private int id; + private int counter = 0; + private String suffix; + private SequenceFile.Writer projectWriter, astWriter, commitWriter, contentWriter; + private long astWriterLen = 0, commitWriterLen = 0, contentWriterLen = 0; + private boolean ready = true; + byte[] bs; + + public ImportTask(int id) { + this.id = id; + } + + public void setProject(byte[] project) { + this.bs = project; + }; + + public boolean isReady() { + return this.ready; + } + + public void openWriters() { + long time = System.currentTimeMillis(); + suffix = id + "-" + time + ".seq"; + while (true) { + try { + projectWriter = SequenceFile.createWriter(fileSystem, conf, new Path(base + "/project/" + suffix), + Text.class, BytesWritable.class, CompressionType.BLOCK); + astWriter = SequenceFile.createWriter(fileSystem, conf, new Path(base + "/ast/" + suffix), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + commitWriter = SequenceFile.createWriter(fileSystem, conf, new Path(base + "/commit/" + suffix), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + contentWriter = SequenceFile.createWriter(fileSystem, conf, new Path(base + "/source/" + suffix), + LongWritable.class, BytesWritable.class, CompressionType.BLOCK); + break; + } catch (Throwable t) { + t.printStackTrace(); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + } + } + } + } + + public void closeWriters() { + while (true) { + try { + projectWriter.close(); + astWriter.close(); + commitWriter.close(); + contentWriter.close(); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + } + break; + } catch (Throwable t) { + t.printStackTrace(); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + } + } + } + } + + @Override + public void run() { + this.ready = false; + + try { + Project cachedProject = null; + try { + cachedProject = Project.parseFrom(bs); + if (processedProjectIds.contains(cachedProject.getId())) { + this.ready = true; + return; + } + } catch (InvalidProtocolBufferException e) { + e.printStackTrace(); + this.ready = true; + return; + } + bs = null; + + final String name = cachedProject.getName(); + + if (debug) + System.out.println( + Thread.currentThread().getId() + " Processing " + cachedProject.getId() + " " + name); + + Project project = storeRepository(cachedProject, 0); + + if (debug) + System.out + .println(Thread.currentThread().getId() + " Putting in sequence file: " + project.getId()); + + // store the project metadata + BytesWritable bw = new BytesWritable(project.toByteArray()); + if (bw.getLength() < MAX_SIZE_FOR_PROJECT_WITH_COMMITS) { + try { + projectWriter.append(new Text(project.getId()), bw); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + Project.Builder pb = Project.newBuilder(project); + for (CodeRepository.Builder cb : pb.getCodeRepositoriesBuilderList()) { + for (Revision.Builder rb : cb.getRevisionsBuilderList()) { + cb.addRevisionKeys(commitWriterLen); + bw = new BytesWritable(rb.build().toByteArray()); + commitWriter.append(new LongWritable(commitWriterLen), bw); + commitWriterLen += bw.getLength(); + } + cb.clearRevisions(); + } + try { + projectWriter.append(new Text(pb.getId()), new BytesWritable(pb.build().toByteArray())); + } catch (IOException e) { + e.printStackTrace(); + } + } + counter++; + if (counter >= Integer.parseInt(DefaultProperties.MAX_PROJECTS)) { + closeWriters(); + openWriters(); + counter = 0; + } + } catch (Throwable e) { + e.printStackTrace(); + } + this.ready = true; + System.out.println(this.id + " counter " + counter); + } + + private Project storeRepository(final Project project, final int i) { + final CodeRepository repo = project.getCodeRepositories(i); + final Project.Builder projBuilder = Project.newBuilder(project); + + final String name = project.getName(); + File gitDir = new File(gitRootPath + "/" + name); + + if (project.getForked() || !(project.getProgrammingLanguagesList().contains("Java") + || project.getProgrammingLanguagesList().contains("JavaScript") + || project.getProgrammingLanguagesList().contains("PHP")) + // || project.getStars() < 2 || project.getSize() < 100 + ) + return project; + + // If repository is already cloned delete then re-clone, this should + // only happen during recover + FileIO.DirectoryRemover filecheck = new FileIO.DirectoryRemover(gitRootPath + "/" + project.getName()); + filecheck.run(); + + String[] args = { repo.getUrl(), gitDir.getAbsolutePath() }; + try { + RepositoryCloner.clone(args); + } catch (Throwable t) { + System.err.println("Error cloning " + repo.getUrl()); + t.printStackTrace(); + return project; + } + + if (debug) + System.out.println(Thread.currentThread().getId() + " Has repository: " + name); + AbstractConnector conn = null; + try { + conn = new GitConnector(gitDir.getAbsolutePath(), project.getName(), astWriter, astWriterLen, + contentWriter, contentWriterLen); + final CodeRepository.Builder repoBuilder = CodeRepository.newBuilder(repo); + for (final Revision rev : conn.getCommits(true, project.getName())) { + // build new rev w/ no namespaces + final Revision.Builder revBuilder = Revision.newBuilder(rev); + repoBuilder.addRevisions(revBuilder); + } + if (repoBuilder.getRevisionsCount() > 0) { + if (debug) + System.out.println(Thread.currentThread().getId() + " Build head snapshot"); + repoBuilder.setHead(conn.getHeadCommitOffset()); + repoBuilder.addAllHeadSnapshot(conn.buildHeadSnapshot(new String[] {}, project.getName())); + } + repoBuilder.addAllBranches(conn.getBranchIndices()); + repoBuilder.addAllBranchNames(conn.getBranchNames()); + repoBuilder.addAllTags(conn.getTagIndices()); + repoBuilder.addAllTagNames(conn.getTagNames()); + + projBuilder.setCodeRepositories(i, repoBuilder); + return projBuilder.build(); + } catch (final Throwable e) { + printError(e, "unknown error", project.getName()); + } finally { + if (conn != null) { + this.astWriterLen = conn.getAstWriterLen(); + this.contentWriterLen = conn.getContentWriterLen(); + try { + conn.close(); + } catch (Exception e) { + printError(e, "Cannot close Git connector to " + gitDir.getAbsolutePath(), project.getName()); + } + } + if (!cache) { + new Thread(new FileIO.DirectoryRemover(gitRootPath + "/" + project.getName())).start(); + } + } + + return project; + } + } + + public static void printError(final Throwable e, final String message, String name) { + System.err.println("ERR: " + message + " proccessing: " + name); + if (debug) { + e.printStackTrace(); + // System.exit(-1); + } else + System.err.println(e.getMessage()); + } +}