Skip to content

Commit

Permalink
TIKA-3091 prevent NPE in PDFParserConfig by initializing
Browse files Browse the repository at this point in the history
three parameters with default values.
  • Loading branch information
tballison committed Apr 14, 2020
1 parent e479995 commit d811de9
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,17 @@ void setExtractInlineImages(boolean extractInlineImages) {
defaultConfig.setExtractInlineImages(extractInlineImages);
}

/**
 * Sets the average-character tolerance on this parser's default configuration.
 * The value is forwarded unchanged to
 * {@code defaultConfig.setAverageCharTolerance(float)}; no validation is done here.
 * NOTE(review): the {@code @Field} annotation presumably exposes this setter to
 * configuration-driven initialization -- confirm against the annotation's docs.
 *
 * @param averageCharTolerance tolerance value handed to the default configuration
 */
@Field
void setAverageCharTolerance(float averageCharTolerance) {
defaultConfig.setAverageCharTolerance(averageCharTolerance);
}

/**
 * Sets the spacing tolerance on this parser's default configuration.
 * The value is forwarded unchanged to
 * {@code defaultConfig.setSpacingTolerance(float)}; no validation is done here.
 * NOTE(review): the {@code @Field} annotation presumably exposes this setter to
 * configuration-driven initialization -- confirm against the annotation's docs.
 *
 * @param spacingTolerance tolerance value handed to the default configuration
 */
@Field
void setSpacingTolerance(float spacingTolerance) {
defaultConfig.setSpacingTolerance(spacingTolerance);
}


@Field
void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,16 @@ private static OCR_STRATEGY parse(String s) {
private boolean extractMarkedContent = false;

//The character width-based tolerance value used to estimate where spaces in text should be added
private Float averageCharTolerance;
//Default taken from PDFBox.
private Float averageCharTolerance = 0.5f;

//The space width-based tolerance value used to estimate where spaces in text should be added
private Float spacingTolerance;
//Default taken from PDFBox.
private Float spacingTolerance = 0.3f;

// The multiplication factor for line height to decide when a new paragraph starts.
private float dropThreshold;
//Default taken from PDFBox.
private float dropThreshold = 2.5f;

//If the PDF has an XFA element, process only that and skip extracting
//content from elsewhere in the document.
Expand Down Expand Up @@ -235,6 +238,10 @@ private void init(InputStream is) {

setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));

setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));

boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1512,6 +1512,16 @@ public void testUnmappedUnicodeStats() throws Exception {

}

/**
 * Regression test for TIKA-3091: a freshly constructed {@code PDFParserConfig}
 * must survive {@code toString()}, {@code hashCode()} and {@code equals()}
 * without throwing a {@code NullPointerException}.
 * The results of the three calls are deliberately discarded; the test passes
 * as long as none of them throws.
 */
@Test
public void testNPEInPDFParserConfig() {
    //TIKA-3091
    PDFParserConfig config = new PDFParserConfig();
    //don't care about values; want to make sure no NPE is thrown
    config.toString();
    config.hashCode();
    config.equals(new PDFParserConfig());
}

@Test //TIKA-3041
@Ignore("turn back on if we add file from PDFBOX-52")
public void testPDFBox52() throws Exception {
Expand Down

0 comments on commit d811de9

Please sign in to comment.