
Commit

Merge remote branch 'origin/master'
manning authored and Stanford NLP committed Oct 10, 2015
1 parent 90ad647 commit af70b51
Showing 82 changed files with 5,713 additions and 3,461 deletions.
2 changes: 1 addition & 1 deletion doc/loglinear/QUICKSTART.txt
@@ -2,7 +2,7 @@ loglinear package quickstart:

First, read the ConcatVector section in ARCH.txt.

-To jump straight into working code, go read generateSentenceModel() in edu.stanford.nlp.loglinear.learning.CoNLLBenchmark.
+To jump straight into working code, go read generateSentenceModel() in edu.stanford.nlp.loglinear.CoNLLBenchmark.

#####################################################

2 changes: 1 addition & 1 deletion doc/loglinear/README.txt
@@ -1,6 +1,6 @@
For an explanation of how everything fits together, see ARCH.txt

-For a quick runnable object, go run edu.stanford.nlp.loglinear.learning.CoNLLBenchmark in core's test package.
+For a quick runnable object, go run edu.stanford.nlp.loglinear.CoNLLBenchmark in core's test package.

For a tutorial, see QUICKSTART.txt

135 changes: 135 additions & 0 deletions itest/src/edu/stanford/nlp/ie/qe/QuantifiableEntityExtractorITest.java
@@ -0,0 +1,135 @@
package edu.stanford.nlp.ie.qe;

import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.pipeline.*;
import junit.framework.TestCase;

import java.util.List;

/**
* Test for quantifiable entity extractor
* @author Angel Chang
*/
public class QuantifiableEntityExtractorITest extends TestCase {
  static AnnotationPipeline pipeline = null;
  static QuantifiableEntityExtractor extractor = null;

  public void test() throws Exception {
    // TODO: Enable tests after rules files are added to models
  }

  @Override
  public void setUp() throws Exception {
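    // Build the annotation pipeline once and share it across tests; the
    // class-level lock keeps the lazy initialization thread-safe.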
    synchronized (QuantifiableEntityExtractorITest.class) {
      if (pipeline == null) {
        pipeline = new AnnotationPipeline();
        pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
        pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
        pipeline.addAnnotator(new POSTaggerAnnotator(DefaultPaths.DEFAULT_POS_MODEL, false));
        //pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));
      }
      extractor = new QuantifiableEntityExtractor();
      //extractor.init(new Options());
    }
  }

  protected static Annotation createDocument(String text) {
    Annotation annotation = new Annotation(text);
    pipeline.annotate(annotation);
    return annotation;
  }

  public static class ExpectedQuantity {
    String text;
    String normalizedValue;
    String type;

    public ExpectedQuantity(String text, String normalizedValue, String type) {
      this.text = text;
      this.normalizedValue = normalizedValue;
      this.type = type;
    }
  }

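  /**
   * Runs the extractor over each sentence and compares matched text, normalized
   * value, and unit type against the expected quantities. Passing null for
   * expected prints the actual matches and then fails, which is handy when
   * bootstrapping new test cases.
   */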
  public void runAndCheck(String prefix, String[] sentences, ExpectedQuantity[][] expected) throws Exception {
    for (int si = 0; si < sentences.length; si++) {
      String sentence = sentences[si];
      Annotation annotation = createDocument(sentence);
      List<MatchedExpression> matchedExpressions = extractor.extract(annotation);

      // Print out matched text and value
      if (expected == null) {
        for (int i = 0; i < matchedExpressions.size(); i++) {
          String text = matchedExpressions.get(i).getText();
          Object value = matchedExpressions.get(i).getValue();
          System.out.println(prefix + ": Got expression " + text + " with value " + value);
        }
        assertTrue(prefix + ": No expected provided", false);
      } else {
        int minMatchable = Math.min(expected[si].length, matchedExpressions.size());
        for (int i = 0; i < minMatchable; i++) {
          ExpectedQuantity expectedQuantity = expected[si][i];
          MatchedExpression matched = matchedExpressions.get(i);
          SimpleQuantifiableEntity actualQuantity = (SimpleQuantifiableEntity) matched.getValue().get();
          assertEquals(prefix + ".matched." + si + "." + i + ".text", expectedQuantity.text, matched.getText());
          assertEquals(prefix + ".matched." + si + "." + i + ".normalizedValue", expectedQuantity.normalizedValue, actualQuantity.toString());
          assertEquals(prefix + ".matched." + si + "." + i + ".type", expectedQuantity.type, actualQuantity.getUnit().type);
        }
        assertEquals(prefix + ".length." + si, expected[si].length, matchedExpressions.size());
      }
    }
  }

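  // The "_test" prefix keeps JUnit from picking these up until the rules files
  // are added to the models (see the TODO in test() above).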
  public void _testMoney() throws Exception {
    String[] sentences = {
        "I have 1 dollar and 2 cents.",
        "It cost 10 thousand million dollars."
    };
    // TODO: merge the 1 dollar and 2 cents
    ExpectedQuantity[][] expected = {
        {new ExpectedQuantity("1 dollar", "$1.00", "MONEY"), new ExpectedQuantity("2 cents", "$0.02", "MONEY")},
        {new ExpectedQuantity("10 thousand million dollars", "$10000000000.00", "MONEY")}
    };

    runAndCheck("testMoney", sentences, expected);
  }

  public void _testLength() throws Exception {
    String[] sentences = {
        "We are 2 kilometer away.",
        "We are 2 kilometers away.",
        "We turn after 5 miles.",
        "The box is 100 centimeters tall.",
        "The box is 10cm wide.",
        "The box is over 1000 mm long.",
        "The box is 2ft long."
    };
    ExpectedQuantity[][] expected = {
        {new ExpectedQuantity("2 kilometer", "2000.0m", "LENGTH")},
        {new ExpectedQuantity("2 kilometers", "2000.0m", "LENGTH")},
        {new ExpectedQuantity("5 miles", "5.0mi", "LENGTH")},
        {new ExpectedQuantity("100 centimeters", "1.0m", "LENGTH")},
        {new ExpectedQuantity("10cm", "0.1m", "LENGTH")},
        {new ExpectedQuantity("1000 mm", "1.0m", "LENGTH")},
        {new ExpectedQuantity("2ft", "2.0'", "LENGTH")}
    };
    runAndCheck("testLength", sentences, expected);
  }

  // We test weight rather than mass: in everyday language kilograms refer to
  // weight, whereas scientific usage reserves them for mass.
  public void _testWeight() throws Exception {
    String[] sentences = {
        "The ball is 2 kilograms in weight.",
        "There are five grams.",
        "How much is seven pounds?"
    };
    ExpectedQuantity[][] expected = {
        {new ExpectedQuantity("2 kilograms", "2.0kg", "WEIGHT")},
        {new ExpectedQuantity("five grams", "0.005kg", "WEIGHT")},
        {new ExpectedQuantity("seven pounds", "7.0lb", "WEIGHT")}
    };
    runAndCheck("testWeight", sentences, expected);
  }

}
TokenSequenceMatcherITest.java (edu.stanford.nlp.ling.tokensregex)
@@ -1,21 +1,21 @@
package edu.stanford.nlp.ling.tokensregex;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import junit.framework.TestCase;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

public class TokenSequenceMatcherITest extends TestCase {
@@ -94,6 +94,50 @@ public void testTokenSequenceMatcherValue() throws IOException {
    assertFalse(match);
  }

  public void testTokenSequenceMatcherBeginEnd() throws IOException {
    CoreMap doc = createDocument(testText);

    // Test simple sequence with begin sequence matching
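    // "^" anchors the match at the start of the token sequence and each "[]"
    // matches any single token, so only the first two tokens can match.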
    TokenSequencePattern p = TokenSequencePattern.compile("^ [] []");
    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));

    boolean match = m.find();
    assertTrue(match);
    assertEquals("the number", m.group());

    match = m.find();
    assertFalse(match);

    // Test simple sequence with end sequence matching
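    // "$" anchors at the end of the token sequence, so the last two tokens
    // ("fifty" and the final period) are the only possible match.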
    p = TokenSequencePattern.compile("[] [] $");
    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));

    match = m.find();
    assertTrue(match);
    assertEquals("fifty.", m.group());

    match = m.find();
    assertFalse(match);

    // Test simple sequence with begin and end sequence matching
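    // "^ [] [] $" requires the whole sequence to be exactly two tokens long,
    // which this document is not, so no match is found.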
    p = TokenSequencePattern.compile("^ [] [] $");
    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));

    match = m.find();
    assertFalse(match);

    // Test simple sequence with ^$ in a string regular expression
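    // Inside /.../ the anchors are plain string-regex anchors applied to a
    // single token's text, so /^number$/ matches a token exactly equal to "number".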
    p = TokenSequencePattern.compile("/^number$/");
    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));

    match = m.find();
    assertTrue(match);
    assertEquals("number", m.group());

    match = m.find();
    assertFalse(match);
  }

  private static final String testText1 = "Mellitus was the first Bishop of London, the third Archbishop of Canterbury, and a member of the Gregorian mission sent to England to convert the Anglo-Saxons. He arrived in 601 AD, and was consecrated as Bishop of London in 604.";

  public void testTokenSequenceMatcher1() throws IOException {
    CoreMap doc = createDocument(testText1);
@@ -179,7 +223,7 @@ public void testTokenSequenceMatcher1() throws IOException {
    match = m.find();
    assertTrue(match);
    assertEquals(0, m.groupCount());
-    assertEquals("London in 604 .", m.group());
+    assertEquals("London in 604.", m.group());
    match = m.find();
    assertFalse(match);
  }
@@ -435,6 +479,31 @@ public void testTokenSequenceMatcherConj() throws IOException {
    assertFalse(match);
  }

  public void testTokenSequenceMatcherConj2() throws IOException {
    String content = "The cat is sleeping on the floor.";
    String greedyPattern = "(?: ([]* cat []*) & ([]* sleeping []*))";

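    // "&" conjoins two branches that must match the same span of tokens; with
    // greedy "[]*" around "cat" and "sleeping", that span is the whole sentence.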
    TokenizerFactory tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tf.getTokenizer(new StringReader(content)).tokenize();
    TokenSequencePattern seqPattern = TokenSequencePattern.compile(greedyPattern);
    TokenSequenceMatcher matcher = seqPattern.getMatcher(tokens);

    boolean entireMatch = matcher.matches();
    assertTrue(entireMatch);

    boolean match = matcher.find();
    assertTrue(match);
    assertEquals("The cat is sleeping on the floor.", matcher.group());

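    // The reluctant "[]*?" consumes as few tokens as possible, so find() returns
    // the shortest span containing both words: "The cat is sleeping".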
    String reluctantPattern = "(?: ([]*? cat []*?) & ([]*? sleeping []*?))";
    TokenSequencePattern seqPattern2 = TokenSequencePattern.compile(reluctantPattern);
    TokenSequenceMatcher matcher2 = seqPattern2.getMatcher(tokens);

    match = matcher2.find();
    assertTrue(match);
    assertEquals("The cat is sleeping", matcher2.group());
  }

  public void testTokenSequenceMatcherConjAll() throws IOException {
    CoreMap doc = createDocument(testText1);
    TokenSequencePattern p = TokenSequencePattern.compile(
@@ -979,7 +1048,7 @@ public void testTokenSequenceOptimizeOrString() throws IOException {
    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
    boolean match = m.find();
    assertTrue(match);
-    assertEquals("atropine we need to have many many words here but we do n't sweating", m.group(0));
+    assertEquals("atropine we need to have many many words here but we don't sweating", m.group(0));

    match = m.find();
    assertFalse(match);
@@ -1005,7 +1074,7 @@ public void testMultiplePatterns() throws IOException {
    CoreMap doc = createDocument("atropine we need to have many many words here but we don't sweating");
    MultiPatternMatcher<CoreMap> multiPatternMatcher = TokenSequencePattern.getMultiPatternMatcher(p1, p2);
    List<String> expected = new ArrayList<String>();
-    expected.add("atropine we need to have many many words here but we do n't sweating");
+    expected.add("atropine we need to have many many words here but we don't sweating");
    Iterator<String> expectedIter = expected.iterator();

    Iterable<SequenceMatchResult<CoreMap>> matches =
@@ -1187,7 +1256,7 @@ public void testTokenSequenceMatcherNumber() throws IOException {
    match = m.find();
    assertTrue(match);
    assertEquals(0, m.groupCount());
-    assertEquals("January 3 , 2002", m.group());
+    assertEquals("January 3, 2002", m.group());
    match = m.find();
    assertFalse(match);

@@ -1196,7 +1265,7 @@ public void testTokenSequenceMatcherNumber() throws IOException {
    match = m.find();
    assertTrue(match);
    assertEquals(0, m.groupCount());
-    assertEquals("January 3 , 2002", m.group());
+    assertEquals("January 3, 2002", m.group());
    match = m.find();
    assertFalse(match);

@@ -1404,6 +1473,32 @@ public void testTokenSequenceMatcherMultiNodePattern() throws IOException {
    assertFalse(match);
  }

  public void testTokenSequenceMatcherMultiNodePattern2() throws IOException {
    CoreMap doc = createDocument("Replace the lamp with model wss.32dc55c3e945384dbc5e533ab711fd24");

    // Greedy
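    // "(?m){1,4}" marks a multi-node group: the string regex may span one to
    // four consecutive tokens instead of matching a single token's text.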
    TokenSequencePattern p = TokenSequencePattern.compile("/model/ ((?m){1,4}/\\w+\\.\\w+/)");
    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
    boolean match = m.find();
    assertTrue(match);
    assertEquals(1, m.groupCount());
    assertEquals("model wss.32dc55c3e945384dbc5e533ab711fd24", m.group());
    assertEquals("wss.32dc55c3e945384dbc5e533ab711fd24", m.group(1));
    match = m.find();
    assertFalse(match);

    // Reluctant
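    // The reluctant "{1,4}?" spans as few tokens as possible, so the group
    // stops at "wss.32" rather than the full identifier.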
    p = TokenSequencePattern.compile("/model/ ((?m){1,4}?/\\w+\\.\\w+/)");
    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
    match = m.find();
    assertTrue(match);
    assertEquals(1, m.groupCount());
    assertEquals("model wss.32", m.group());
    assertEquals("wss.32", m.group(1));
    match = m.find();
    assertFalse(match);
  }

  public void testTokenSequenceMatcherBackRef() throws IOException {
    CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A");

@@ -1488,17 +1583,18 @@ public void testCompile() {
    //assertEquals(m.group(), "matching this");
  }

  //This DOES NOT work right now!!
  // public void testCompile2(){
  public void testBindingCompile(){
    Env env = TokenSequencePattern.getNewEnv();
    env.bind("wordname",CoreAnnotations.TextAnnotation.class);
    String s = "[wordname:\"name\"]{1,2}";
    TokenSequencePattern p = TokenSequencePattern.compile(env, s);
  }

  // // This does not work!!!
  // public void testNoBindingCompile(){
  //   Env env = TokenSequencePattern.getNewEnv();
  //   env.bind("wordname",CoreAnnotations.TextAnnotation.class);
  //   String s = "[" + CoreAnnotations.TextAnnotation.class.getName()+":\"name\"]{1,2}";
  //   TokenSequencePattern p = TokenSequencePattern.compile(env, s);
  //   for(Map.Entry<String, Object> vars: env.getVariables().entrySet()){
  //     if(vars.getValue().equals(CoreAnnotations.TextAnnotation.class)){
  //       System.out.println("Found " + vars.getKey() + " binding for " + vars.getValue());
  //     }
  //   }
  // }

  public void testCaseInsensitive1(){
CoNLLBenchmark.java (moved from edu.stanford.nlp.loglinear.learning to edu.stanford.nlp.loglinear)
@@ -1,6 +1,9 @@
-package edu.stanford.nlp.loglinear.learning;
+package edu.stanford.nlp.loglinear;

import edu.stanford.nlp.loglinear.inference.CliqueTree;
+import edu.stanford.nlp.loglinear.learning.AbstractBatchOptimizer;
+import edu.stanford.nlp.loglinear.learning.BacktrackingAdaGradOptimizer;
+import edu.stanford.nlp.loglinear.learning.LogLikelihoodFunction;
import edu.stanford.nlp.loglinear.model.ConcatVector;
import edu.stanford.nlp.loglinear.model.GraphicalModel;
import edu.stanford.nlp.util.HashIndex;
DependencyParserITest.java (edu.stanford.nlp.parser.nndep)
@@ -46,15 +46,15 @@ public void testDependencyParserEnglishSD() {
  }

  // Lower because we're evaluating on PTB + extraDevTest, not just PTB
-  private static final double EnglishUdLas = 84.9873;
+  private static final double EnglishUdLas = 88.72648417258083;

  /**
   * Test that the NN dependency parser performance doesn't change.
   */
  public void testDependencyParserEnglishUD() {
    DependencyParser parser = new DependencyParser();
    parser.loadModelFile("/u/nlp/data/depparser/nn/distrib-2015-04-16/english_UD.gz");
-    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/USD/dev.conll", null);
+    double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/UD-converted/dev.conll", null);
    assertEquals(String.format("English UD LAS should be %.2f but was %.2f",
        EnglishUdLas, las), EnglishUdLas, las, 1e-4);
  }
