Skip to content

Commit

Permalink
TIKA-2807 -- extract sdt content from within textbox in docx
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Jan 7, 2019
1 parent 75c8b9f commit 06cf66c
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Release 2.0.0 - ???


Release 1.21 - ????
* Extract text from SDT elements within text boxes in .docx files (TIKA-2807).

* Try to handle truncated OOXML files more robustly (TIKA-2765).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,9 +328,11 @@ private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManag
xhtml.characters(footnameText + "\n");
}

// Also extract any paragraphs embedded in text boxes:
// Also extract any paragraphs embedded in text boxes
//Note "w:txbxContent//"...must look for all descendant paragraphs
//not just the immediate children of txbxContent -- TIKA-2807
if (config.getIncludeShapeBasedContent()) {
for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p")) {
extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,15 @@ public void testTextInsideTextBox() throws Exception {
assertContains("This text is inside of a text box in the footer of the document.", xml);
}

/**
 * TIKA-2807: checks the XML produced for {@code testWORD_sdtInTextBox.docx}.
 * <p>
 * The output must contain the rich-text content-control text, and the
 * marker {@code "inside-text"} is checked with an expected count of 1
 * (presumably "exactly once", i.e. no duplicated extraction -- confirm
 * against {@code assertContainsCount}).
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testSDTInTextBox() throws Exception {
    // No debug dump of the extracted XML here: printing it to stdout only
    // adds noise to the build log; the assertions below report failures.
    String xml = getXML("testWORD_sdtInTextBox.docx").xml;
    assertContains("rich-text-content-control_inside-text-box", xml);
    assertContainsCount("inside-text", xml, 1);
}

//TIKA-2346
@Test
public void testTurningOffTextBoxExtraction() throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -860,5 +860,12 @@ public void testTextDecorationNested() throws Exception {
assertNotContained("unde ", txt);
}

/**
 * TIKA-2807: same check as for the default parser, but the document is
 * parsed with this test class's {@code parseContext}.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testSDTInTextBox() throws Exception {
    final String extracted = getXML("testWORD_sdtInTextBox.docx", parseContext).xml;

    // The content-control text from inside the text box must be present...
    assertContains("rich-text-content-control_inside-text-box", extracted);
    // ...and the marker is checked with an expected count of 1.
    assertContainsCount("inside-text", extracted, 1);
}

}
Binary file not shown.

0 comments on commit 06cf66c

Please sign in to comment.