Skip to content

Commit

Permalink
TIKA-3641 -- upgrade Lucene to 9.x now that we're on Java 11 (apache#…
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison authored Sep 13, 2023
1 parent 4bba971 commit 0960d74
Show file tree
Hide file tree
Showing 10 changed files with 76 additions and 21 deletions.
4 changes: 2 additions & 2 deletions tika-eval/tika-eval-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@
<exclude>org.apache.commons:commons-lang3:jar:</exclude>
<exclude>org.apache.commons:commons-math3:jar:</exclude>
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
<exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
<exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@
import java.util.concurrent.ConcurrentHashMap;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.ByteVectorValues;
import org.apache.lucene.index.CompositeReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafMetaData;
import org.apache.lucene.index.LeafReader;
Expand All @@ -45,7 +47,10 @@
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;

Expand Down Expand Up @@ -264,12 +269,43 @@ public NumericDocValues getNormValues(String field) throws IOException {
return MultiDocValues.getNormValues(in, field); // TODO cache?
}

/**
 * Stub for float vector access: not implemented by this wrapper.
 *
 * <p>The argument is ignored and {@code null} is returned unconditionally.
 *
 * @param s field name (ignored by this stub)
 * @return always {@code null}
 * @throws IOException declared to match the overridden signature; this stub never throws it
 */
@Override
public FloatVectorValues getFloatVectorValues(String s) throws IOException {
//TODO figure out how to implement this... if needed
return null;
}

/**
 * Stub for byte vector access: not implemented by this wrapper.
 *
 * <p>The argument is ignored and {@code null} is returned unconditionally.
 *
 * @param s field name (ignored by this stub)
 * @return always {@code null}
 * @throws IOException declared to match the overridden signature; this stub never throws it
 */
@Override
public ByteVectorValues getByteVectorValues(String s) throws IOException {
//TODO figure out how to implement this... if needed
return null;
}

/**
 * Stub for nearest-neighbor search over float vectors: not implemented by this wrapper.
 *
 * <p>All arguments are ignored and {@code null} is returned unconditionally.
 * NOTE(review): a caller that dereferences the result without a null check would fail;
 * confirm no vector queries are ever run against this reader.
 *
 * @param s field name (ignored)
 * @param floats query vector (ignored)
 * @param i presumably the number of neighbors requested -- confirm against the Lucene API (ignored)
 * @param bits presumably the set of acceptable documents -- confirm against the Lucene API (ignored)
 * @param i1 presumably the visited-node limit -- confirm against the Lucene API (ignored)
 * @return always {@code null}
 * @throws IOException declared to match the overridden signature; this stub never throws it
 */
@Override
public TopDocs searchNearestVectors(String s, float[] floats, int i, Bits bits, int i1)
throws IOException {
//TODO figure out how to implement this... if needed
return null;
}

/**
 * Stub for nearest-neighbor search over byte vectors: not implemented by this wrapper.
 *
 * <p>All arguments are ignored and {@code null} is returned unconditionally.
 * NOTE(review): a caller that dereferences the result without a null check would fail;
 * confirm no vector queries are ever run against this reader.
 *
 * @param s field name (ignored)
 * @param bytes query vector (ignored)
 * @param i presumably the number of neighbors requested -- confirm against the Lucene API (ignored)
 * @param bits presumably the set of acceptable documents -- confirm against the Lucene API (ignored)
 * @param i1 presumably the visited-node limit -- confirm against the Lucene API (ignored)
 * @return always {@code null}
 * @throws IOException declared to match the overridden signature; this stub never throws it
 */
@Override
public TopDocs searchNearestVectors(String s, byte[] bytes, int i, Bits bits, int i1)
throws IOException {
//TODO figure out how to implement this... if needed
return null;
}

/**
 * Looks up the term vectors of one document by delegating to the wrapped reader.
 *
 * @param docID id of the document whose term vectors are requested
 * @return whatever the wrapped reader returns for {@code docID}
 * @throws IOException if the wrapped reader fails while reading
 */
@Override
public Fields getTermVectors(int docID) throws IOException {
    // Reject use of a closed reader before touching the delegate.
    ensureOpen();
    final Fields vectorsForDoc = in.getTermVectors(docID);
    return vectorsForDoc;
}

/**
 * Returns the {@link TermVectors} accessor of the wrapped reader.
 *
 * <p>The open-state check is performed first so that this accessor behaves like the
 * other delegating accessors of this class (for example {@code getTermVectors(int)}),
 * which refuse to operate on a closed reader.
 *
 * @return the term vectors accessor obtained from the wrapped reader
 * @throws IOException if the wrapped reader fails while creating the accessor
 */
@Override
public TermVectors termVectors() throws IOException {
    ensureOpen();
    return in.termVectors();
}

@Override
public int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
Expand All @@ -288,6 +324,11 @@ public void document(int docID, StoredFieldVisitor visitor) throws IOException {
in.document(docID, visitor);
}

/**
 * Returns the {@link StoredFields} accessor of the wrapped reader.
 *
 * <p>The open-state check is performed first so that this accessor behaves like the
 * other delegating accessors of this class (for example {@code getTermVectors(int)}
 * and {@code getLiveDocs()}), which refuse to operate on a closed reader.
 *
 * @return the stored fields accessor obtained from the wrapped reader
 * @throws IOException if the wrapped reader fails while creating the accessor
 */
@Override
public StoredFields storedFields() throws IOException {
    ensureOpen();
    return in.storedFields();
}

@Override
public Bits getLiveDocs() {
ensureOpen();
Expand Down
4 changes: 2 additions & 2 deletions tika-eval/tika-eval-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-icu</artifactId>
<artifactId>lucene-analysis-icu</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,26 @@
import java.util.Map;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
* Factory for filter that only allows tokens with characters that "isAlphabetic" or "isIdeographic" through.
*/
public class AlphaIdeographFilterFactory extends TokenFilterFactory {

public static final String NAME = "alphaIdeograph";

private static final int UNDERSCORE = (int) '_';


/**
 * Creates the factory without configuration arguments; simply invokes the
 * no-argument superclass constructor.
 */
public AlphaIdeographFilterFactory() {
super();
}

/**
 * Creates the factory from a map of configuration arguments.
 *
 * <p>This constructor reads nothing from the map itself; the map is passed as-is
 * to the superclass constructor.
 *
 * @param args configuration arguments forwarded to {@code TokenFilterFactory}
 */
public AlphaIdeographFilterFactory(Map<String, String> args) {
super(args);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.ClasspathResourceLoader;

class AnalyzerDeserializer {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
import java.util.Map;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
* Creates a very narrowly focused TokenFilter that limits tokens based on length
Expand All @@ -35,9 +35,15 @@
*/
public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory {

public static final String NAME = "cjkBigramAwareLength";


private final int min;
private final int max;
/**
 * Creates the factory with fixed built-in bounds: {@code min} is set to 3 and
 * {@code max} to 20.
 * NOTE(review): presumably these are the minimum and maximum token lengths the
 * created filter lets through -- confirm against the filter implementation.
 */
public CJKBigramAwareLengthFilterFactory() {
min = 3;
max = 20;
}

public CJKBigramAwareLengthFilterFactory(Map<String, String> args) {
super(args);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@
import java.util.Map;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.email.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
* Factory for filter that normalizes urls and emails to ___url___ and ___email___
Expand All @@ -35,11 +35,16 @@
*/
public class URLEmailNormalizingFilterFactory extends TokenFilterFactory {

public static final String NAME = "urlEmailNormalizing";

public static final String URL = "___url___";
public static final String EMAIL = "___email___";
private static final char[] URL_CHARS = URL.toCharArray();
private static final char[] EMAIL_CHARS = EMAIL.toCharArray();

/**
 * Creates the factory without configuration arguments; simply invokes the
 * no-argument superclass constructor.
 */
public URLEmailNormalizingFilterFactory() {
super();
}
/**
 * Creates the factory from a map of configuration arguments.
 *
 * <p>This constructor reads nothing from the map itself; the map is passed as-is
 * to the superclass constructor.
 *
 * @param args configuration arguments forwarded to {@code TokenFilterFactory}
 */
public URLEmailNormalizingFilterFactory(Map<String, String> args) {
super(args);
}
Expand Down
12 changes: 4 additions & 8 deletions tika-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,7 @@
<guava.version>32.1.2-jre</guava.version>
<httpcomponents.version>4.5.14</httpcomponents.version>
<httpcore.version>4.4.16</httpcore.version>
<!-- versions greater than 62.2 are not compatible with icu4j handler in lucene 8.11.1 -->
<icu4j.version>62.2</icu4j.version>
<icu4j.version>73.2</icu4j.version>
<imageio.version>1.4.0</imageio.version>
<jackrabbit.version>2.21.19</jackrabbit.version>
<jackson.version>2.15.2</jackson.version>
Expand All @@ -361,8 +360,7 @@
<libpst.version>0.9.3</libpst.version>
<log4j2.version>2.20.0</log4j2.version>
<lombok.version>1.18.20</lombok.version>
<!-- lucene >= 9.x requires Java 11 -->
<lucene.version>8.11.2</lucene.version>
<lucene.version>9.7.0</lucene.version>
<metadata.extractor.version>2.18.0</metadata.extractor.version>
<microsoft.translator.version>0.6.2</microsoft.translator.version>
<mime4j.version>0.8.9</mime4j.version>
Expand Down Expand Up @@ -773,12 +771,12 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-icu</artifactId>
<artifactId>lucene-analysis-icu</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
Expand Down Expand Up @@ -1010,8 +1008,6 @@
<version>1.53.0</version>
</dependency>
<dependency>
<!-- CVE-2018-18928 does affect the java library not just the c/c++ library,
upon further research -->
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>${icu4j.version}</version>
Expand Down
4 changes: 2 additions & 2 deletions tika-server/tika-server-eval/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@
<exclude>org.apache.commons:commons-lang3:jar:</exclude>
<exclude>org.apache.commons:commons-math3:jar:</exclude>
<exclude>org.apache.lucene:lucene-core:jar:</exclude>
<exclude>org.apache.lucene:lucene-analyzers-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analyzers-icu:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-common:jar:</exclude>
<exclude>org.apache.lucene:lucene-analysis-icu:jar:</exclude>
<exclude>org.ccil.cowan.tagsoup:tagsoup:jar:</exclude>
<exclude>com.ibm.icu:icu4j:jar:</exclude>
<exclude>com.fasterxml.jackson.core:jackson-core:jar:</exclude>
Expand Down

0 comments on commit 0960d74

Please sign in to comment.