Skip to content

Commit

Permalink
TIKA-1476 - Added tests for TesseractOCRConfig external configuration…
Browse files Browse the repository at this point in the history
… through properties files

git-svn-id: https://svn.apache.org/repos/asf/tika/trunk@1640068 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
dameikle committed Nov 17, 2014
1 parent 3b2fba4 commit 069ac2d
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;

import org.apache.tika.TikaTest;
import org.junit.Test;

import java.io.InputStream;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

/**
 * Tests the values exposed by {@link TesseractOCRConfig} when it is built
 * with no configuration, with a properties file that overrides only some
 * settings, and with a properties file that overrides all of them.
 */
public class TesseractOCRConfigTest extends TikaTest {

    /**
     * Builds a {@link TesseractOCRConfig} from a properties file on the test
     * classpath.
     * <p>
     * Fails with an explicit message when the resource is missing, rather than
     * passing a {@code null} stream to the constructor, and always closes the
     * stream this test opened.
     *
     * @param resourcePath absolute classpath location of the properties file
     * @return the configuration built from that file
     * @throws Exception if the configuration cannot be built or the stream
     *                   cannot be closed
     */
    private static TesseractOCRConfig loadConfig(String resourcePath) throws Exception {
        InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(resourcePath);
        assertNotNull("Missing test resource: " + resourcePath, stream);
        try {
            return new TesseractOCRConfig(stream);
        } finally {
            // Closing an already-closed stream is a no-op, so this is safe
            // whether or not the constructor closes the stream itself.
            stream.close();
        }
    }

    /** A config built without a properties file must report the expected defaults. */
    @Test
    public void testNoConfig() throws Exception {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
        assertEquals("Invalid default language value", "eng", config.getLanguage());
        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
        assertEquals("Invalid default timeout value", 120, config.getTimeout());
    }

    /**
     * A properties file that sets only language, minFileSizeToOcr and timeout
     * must override those three values and leave the others at their defaults.
     */
    @Test
    public void testPartialConfig() throws Exception {
        TesseractOCRConfig config = loadConfig("/test-properties/TesseractOCRConfig-partial.properties");

        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
    }

    /** A properties file that sets every value must override all of them. */
    @Test
    public void testFullConfig() throws Exception {
        TesseractOCRConfig config = loadConfig("/test-properties/TesseractOCRConfig-full.properties");

        assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract/", config.getTesseractPath());
        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
        assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
        assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test fixture that sets all six TesseractOCRConfig properties; the values
# match the expectations in TesseractOCRConfigTest.testFullConfig.
# NOTE(review): that test expects getTesseractPath() to return
# "/opt/tesseract/" (with a trailing slash) although the value below has
# none -- presumably TesseractOCRConfig appends the separator; confirm.
tesseractPath=/opt/tesseract
language=fra+deu
pageSegMode=2
maxFileSizeToOcr=2000000
timeout=240
minFileSizeToOcr=1
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test fixture that sets only three properties; the values match the
# overrides expected by TesseractOCRConfigTest.testPartialConfig.
# tesseractPath, pageSegMode and maxFileSizeToOcr are deliberately absent:
# that test asserts they keep their default values.
language=fra+deu
timeout=240
minFileSizeToOcr=1

0 comments on commit 069ac2d

Please sign in to comment.