Skip to content

Commit

Permalink
Merge branch 'master' into gsoc17
Browse files Browse the repository at this point in the history
  • Loading branch information
chrismattmann committed Aug 9, 2017
2 parents 1941a29 + e5526b5 commit b422eda
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 45 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
Release 1.17 - ???

* OOXML locale should be set by POI's LocaleUtil not Locale.getDefault().
Fix unit tests to be robust against different locales in OOXML
and ExcelParser (TIKA-2438).

* Upgrade to PDFBox 2.0.7 (TIKA-2431).

* Tika now has support for automatic image captioning, that
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,18 @@ public void addText(char[] cbuf, int off, int len) {
writer.write(' ');
}

/**
* {@inheritDoc}
*
* @throws IllegalStateException if no models have been loaded with
* {@link #loadModels() } or {@link #loadModels(java.util.Set) }
* @return the detected list of languages
*/
@Override
public List<LanguageResult> detectAll() {
// TODO throw exception if models haven't been loaded, or auto-load all?
if(detector == null) {
throw new IllegalStateException("models haven't been loaded yet (forgot to call loadModels?)");
}

List<LanguageResult> result = new ArrayList<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LocaleUtil;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
Expand Down Expand Up @@ -179,7 +180,7 @@ protected void parse(
break;
case WORKBOOK:
case XLR:
Locale locale = context.get(Locale.class, Locale.getDefault());
Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
break;
case PROJECT:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.LocaleUtil;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
Expand Down Expand Up @@ -61,7 +62,7 @@ public static void parse(
InputStream stream, ContentHandler baseHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
Locale locale = context.get(Locale.class, Locale.getDefault());
Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale());
ExtractorFactory.setThreadPrefersEventExtractors(true);

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1348,7 +1348,28 @@ public void testBigIntegersWGeneralFormat() throws Exception {
DecimalFormatSymbols symbols = new DecimalFormatSymbols(locale);
//16 digit number is treated as scientific notation as is the 16 digit formula
assertContains("1"+symbols.getDecimalSeparator()+"23456789012345E+15</td>\t"+
"<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml); }
"<td>1"+symbols.getDecimalSeparator()+"23456789012345E+15", xml);
}

/**
 * Checks General-format rendering of large integers while POI's user locale
 * is pinned to Italian (TIKA-2438). The user locale is reset to USER_LOCALE
 * in a finally block, so it is restored whether the assertions pass or fail.
 */
@Test
public void testBigIntegersWGeneralFormatWLocaleIT() throws Exception {
    //TIKA-2438
    LocaleUtil.setUserLocale(Locale.ITALIAN);
    try {
        String content = getXML("testEXCEL_big_numbers.xlsx").xml;
        //15 digit number
        assertContains("123456789012345", content);
        //15 digit formula
        assertContains("123456789012346", content);

        // the expected decimal separator is taken from the locale POI currently reports
        char sep = new DecimalFormatSymbols(LocaleUtil.getUserLocale()).getDecimalSeparator();
        String sci = "1" + sep + "23456789012345E+15";
        //16 digit number is treated as scientific notation as is the 16 digit formula
        assertContains(sci + "</td>\t" + "<td>" + sci, content);
    } finally {
        LocaleUtil.setUserLocale(USER_LOCALE);
    }
}


@Test
public void testBoldHyperlink() throws Exception {
Expand Down Expand Up @@ -1540,47 +1561,53 @@ public void testExcelXLSB() throws Exception {

/**
 * Parses testEXCEL_various.xlsb with macro extraction enabled and checks the
 * extracted content of the container document: numeric and formatted cells,
 * comments, text box and WordArt text, hyperlinks, and headers/footers.
 *
 * The whole body runs with POI's user locale pinned to US; the locale is
 * reset to USER_LOCALE in a finally block so later tests are unaffected.
 */
@Test
public void testXLSBVarious() throws Exception {
    try {
        LocaleUtil.setUserLocale(Locale.US);
        //have to set to US because of a bug in POI for $ 3.03 in Locale.ITALIAN
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setExtractMacros(true);
        ParseContext parseContext = new ParseContext();
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
        assertEquals(4, metadataList.size());

        // only the first metadata entry's extracted content is inspected below
        String xml = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("<td>13</td>", xml);
        assertContains("<td>13.1211231321</td>", xml);
        assertContains("<td>$ 3.03</td>", xml);
        assertContains("<td>20%</td>", xml);
        assertContains("<td>13.12</td>", xml);
        assertContains("<td>123456789012345</td>", xml);
        assertContains("<td>1.23456789012345E+15</td>", xml);
        assertContains("test comment2", xml);

        assertContains("comment4 (end of row)", xml);


        assertContains("<td>1/4</td>", xml);
        assertContains("<td>3/9/17</td>", xml);
        assertContains("<td>4</td>", xml);
        assertContains("<td>2</td>", xml);

        assertContains("<td> 46/1963</td>", xml);
        assertContains("<td> 3/128</td>", xml);
        assertContains("test textbox", xml);

        assertContains("test WordArt", xml);

        assertContains("<a href=\"http://lucene.apache.org/\">http://lucene.apache.org/</a>", xml);
        assertContains("<a href=\"http://tika.apache.org/\">http://tika.apache.org/</a>", xml);

        assertContains("OddLeftHeader OddCenterHeader OddRightHeader", xml);
        assertContains("EvenLeftHeader EvenCenterHeader EvenRightHeader", xml);

        assertContains("FirstPageLeftHeader FirstPageCenterHeader FirstPageRightHeader", xml);
        assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
        assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
        assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
    } finally {
        LocaleUtil.setUserLocale(USER_LOCALE);
    }
}

@Test
Expand Down

0 comments on commit b422eda

Please sign in to comment.