Skip to content

Commit

Permalink
TIKA-2440 -- extract phonetic runs from xls and allow users to turn o…
Browse files Browse the repository at this point in the history
…ff extraction of phonetic runs in both xls and xlsx.
  • Loading branch information
tballison committed Aug 29, 2017
1 parent 87033d6 commit 74574e3
Show file tree
Hide file tree
Showing 10 changed files with 141 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
Release 1.17 - ???

* Extract phonetic runs from xls and allow users to turn off extraction
of phonetic runs in both xls and xlsx (TIKA-2440).

* OOXML locale should be set by POI's LocaleUtil not Locale.getDefault().
Fix unit tests to be robust against different locales in OOXML
and ExcelParser (TIKA-2438).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,14 @@ public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
/**
* Forwards the macro-extraction setting to {@code defaultOfficeParserConfig}.
*
* @param extractMacros value passed unchanged to
*                      {@code defaultOfficeParserConfig.setExtractMacros}
*/
public void setExtractMacros(boolean extractMacros) {
defaultOfficeParserConfig.setExtractMacros(extractMacros);
}

/**
* Forwards the phonetic-run concatenation setting to
* {@code defaultOfficeParserConfig}.
* <p>
* NOTE(review): the {@code @Field} annotation presumably lets a Tika config
* file set this through a {@code concatenatePhoneticRuns} param -- confirm
* against the config loader.
* </p>
*
* @param concatenatePhoneticRuns value passed unchanged to
*                                {@code defaultOfficeParserConfig.setConcatenatePhoneticRuns}
*/
@Field
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
defaultOfficeParserConfig.setConcatenatePhoneticRuns(concatenatePhoneticRuns);
}

/**
 * Reports whether phonetic runs are concatenated to the cell text, as
 * currently configured on {@code defaultOfficeParserConfig}.
 * <p>
 * This getter was previously declared {@code void} and discarded the
 * delegate's result, so callers could never read the setting.
 * </p>
 *
 * @return the value returned by
 *         {@code defaultOfficeParserConfig.getConcatenatePhoneticRuns()}
 */
boolean getConcatenatePhoneticRuns() {
    return defaultOfficeParserConfig.getConcatenatePhoneticRuns();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtSSTRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FooterRecord;
import org.apache.poi.hssf.record.FormatRecord;
Expand Down Expand Up @@ -165,7 +166,7 @@ protected void parse(
Biff8EncryptionKey.setCurrentUserPassword(getPassword());

// Have the file processed in event mode
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this, officeParserConfig);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();

Expand Down Expand Up @@ -204,6 +205,8 @@ private static class TikaHSSFListener implements HSSFListener {
* @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
*/
private final NumberFormat format;

private final OfficeParserConfig officeParserConfig;
/**
* Potential exception thrown by the content handler. When set to
* non-<code>null</code>, causes all subsequent HSSF records to be
Expand Down Expand Up @@ -253,12 +256,13 @@ private static class TikaHSSFListener implements HSSFListener {
*
* @param handler Destination to write the parsed output to
*/
private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor, OfficeParserConfig officeParserConfig) {
this.handler = handler;
this.extractor = extractor;
// The number format and the cell data formatter are both created for the supplied locale.
this.format = NumberFormat.getInstance(locale);
// The format-tracking listener is constructed around this listener, so "this"
// escapes here; keep the assignment order as is.
this.formatListener = new TikaFormatTrackingHSSFListener(this, locale);
this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale);
// Read when handling LabelSSTRecord cells to decide whether the phonetic
// text is appended to the cell string. Stored as given (no null check).
this.officeParserConfig = officeParserConfig;
}

/**
Expand All @@ -280,6 +284,7 @@ public void processFile(DirectoryNode root, boolean listenForAllRecords)

// Set up listener and register the records we want to process
HSSFRequest hssfRequest = new HSSFRequest();
listenForAllRecords = true;
if (listenForAllRecords) {
hssfRequest.addListenerForAllRecords(formatListener);
} else {
Expand Down Expand Up @@ -426,7 +431,18 @@ private void internalProcessRecord(Record record) throws SAXException, TikaExcep
case LabelSSTRecord.sid: // Ref. a string in the shared string table
LabelSSTRecord sst = (LabelSSTRecord) record;
UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
addTextCell(record, unicode.getString());
String cellString = null;
if (officeParserConfig.getConcatenatePhoneticRuns()) {
String phonetic = (unicode != null
&& unicode.getExtendedRst() != null
&& unicode.getExtendedRst().getPhoneticText() != null
&& unicode.getExtendedRst().getPhoneticText().trim().length() > 0) ?
unicode.getExtendedRst().getPhoneticText() : "";
cellString = unicode.getString()+" "+phonetic;
} else {
cellString = unicode.getString();
}
addTextCell(record, cellString);
break;

case NumberRecord.sid: // Contains a numeric cell value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class OfficeParserConfig implements Serializable {
private boolean includeMoveFromContent = false;
private boolean includeShapeBasedContent = true;
private boolean includeHeadersAndFooters = true;
private boolean concatenatePhoneticRuns = true;

private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;
Expand Down Expand Up @@ -97,7 +98,7 @@ public boolean getIncludeMoveFromContent() {
/**
* Sets the {@code includeShapeBasedContent} flag (initialised to {@code true}).
*
* @param includeShapeBasedContent new value of the flag
*/
public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) {
this.includeShapeBasedContent = includeShapeBasedContent;
}

/**
* Returns the current {@code includeShapeBasedContent} flag.
*
* @return the stored flag; {@code true} unless changed through the setter
*/
public boolean getIncludeShapeBasedContent() {
return includeShapeBasedContent;
}
Expand Down Expand Up @@ -149,6 +150,26 @@ public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
/**
* Returns the current {@code useSAXPptxExtractor} flag.
*
* @return the stored flag; {@code false} unless changed through the setter
*/
public boolean getUseSAXPptxExtractor() {
return useSAXPptxExtractor;
}


/**
* Returns whether phonetic runs are concatenated to the original text.
*
* @return the stored flag; {@code true} unless changed through
*         {@link #setConcatenatePhoneticRuns(boolean)}
*/
public boolean getConcatenatePhoneticRuns() {
return concatenatePhoneticRuns;
}

/**
* Microsoft Excel files can sometimes contain phonetic (furigana) strings.
* See <a href="https://support.office.com/en-us/article/PHONETIC-function-9a329dac-0c0f-42f8-9a55-639086988554">PHONETIC</a>.
* This sets whether or not the parser will concatenate the phonetic runs to the original text.
* <p>
* This is currently only supported by the xls and xlsx parsers (not the xlsb parser),
* and the default is <code>true</code>.
* </p>
*
* @param concatenatePhoneticRuns <code>true</code> to append phonetic runs to the
*                                original text, <code>false</code> to leave them out
*/
public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
}
}


Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale)
((XSSFEventBasedExcelExtractor)extractor).setIncludeTextBoxes(config.getIncludeShapeBasedContent());
((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false);
((XSSFEventBasedExcelExtractor)extractor).setLocale(locale);
//given that we load our own shared strings table, setting:
//((XSSFEventBasedExcelExtractor)extractor).setConcatenatePhoneticRuns();
//does no good here.
}

@Override
Expand Down Expand Up @@ -132,7 +135,7 @@ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
styles = xssfReader.getStylesTable();

iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
strings = new ReadOnlySharedStringsTable(container);
strings = new ReadOnlySharedStringsTable(container, config.getConcatenatePhoneticRuns());
} catch (InvalidFormatException e) {
throw new XmlException(e);
} catch (OpenXML4JException oe) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -520,4 +520,28 @@ public void testTurningOffTextBoxExtractionExcel() throws Exception {
String xml = getXML("testEXCEL_textbox.xls", pc).xml;
assertNotContained("autoshape", xml);
}

/**
 * Checks phonetic-run extraction for xls in three configurations: the
 * default (runs concatenated), disabled through an
 * {@link OfficeParserConfig} in the {@link ParseContext}, and disabled
 * through a Tika config file.
 */
@Test
public void testPhoneticStrings() throws Exception {
    //This unit test and test file come from Apache POI 51519.xlsx

    // Cell text followed by a single space and its phonetic run.
    final String textWithPhonetic = "\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3";
    final String testFile = "testEXCEL_phonetic.xls";

    //test default concatenates = true
    assertContains(textWithPhonetic, getXML(testFile).xml);

    //test turning it off
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setConcatenatePhoneticRuns(false);
    ParseContext pc = new ParseContext();
    pc.set(OfficeParserConfig.class, officeParserConfig);
    assertNotContained(textWithPhonetic, getXML(testFile, pc).xml);

    //test configuring via config file
    // Close the resource stream once the config has been built; it was
    // previously left open.
    TikaConfig tikaConfig;
    try (java.io.InputStream configStream =
                 OfficeParser.class.getResourceAsStream("tika-config-exclude-phonetic.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertNotContained(textWithPhonetic, getXML(testFile, parser).xml);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
Expand All @@ -44,6 +45,7 @@
import java.util.regex.Pattern;

import org.apache.poi.util.LocaleUtil;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
Expand Down Expand Up @@ -1709,6 +1711,31 @@ public void testPPTXChartData() throws Exception {
assertNotContained("chartSpace", xml);
}

/**
 * Checks phonetic-run extraction for xlsx in three configurations: the
 * default (runs concatenated), disabled through an
 * {@link OfficeParserConfig} in the {@link ParseContext}, and disabled
 * through a Tika config file.
 */
@Test
public void testXLSXPhoneticStrings() throws Exception {
    //This unit test and test file come from Apache POI 51519.xlsx

    // Cell text followed by a single space and its phonetic run.
    final String textWithPhonetic = "\u65E5\u672C\u30AA\u30E9\u30AF\u30EB \u30CB\u30DB\u30F3";
    final String testFile = "testEXCEL_phonetic.xlsx";

    //test default concatenates = true
    assertContains(textWithPhonetic, getXML(testFile).xml);

    //test turning it off
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setConcatenatePhoneticRuns(false);
    ParseContext pc = new ParseContext();
    pc.set(OfficeParserConfig.class, officeParserConfig);
    assertNotContained(textWithPhonetic, getXML(testFile, pc).xml);

    //test configuring via config file
    // Close the resource stream once the config has been built; it was
    // previously left open.
    TikaConfig tikaConfig;
    try (java.io.InputStream configStream =
                 OfficeParser.class.getResourceAsStream("tika-config-exclude-phonetic.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertNotContained(textWithPhonetic, getXML(testFile, parser).xml);
}

}


Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<properties>
<!--
Test configuration: sets concatenatePhoneticRuns to false on both the
OOXML parser and the OLE2 Office parser.
NOTE(review): presumably this is the tika-config-exclude-phonetic.xml
resource loaded by the phonetic-string tests; confirm the file name.
-->
<parsers>
<parser class="org.apache.tika.parser.DefaultParser"/>
<parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
<params>
<param name="concatenatePhoneticRuns" type="bool">false</param>
</params>
</parser>
<parser class="org.apache.tika.parser.microsoft.OfficeParser">
<params>
<param name="concatenatePhoneticRuns" type="bool">false</param>
</params>
</parser>
</parsers>
</properties>
Binary file not shown.
Binary file not shown.

0 comments on commit 74574e3

Please sign in to comment.