Skip to content

Commit

Permalink
TIKA-2906 -- cleanup and bug fix on entropy
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Aug 16, 2019
1 parent dad1c72 commit e287fe2
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 11 deletions.
13 changes: 3 additions & 10 deletions tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import org.apache.tika.eval.tokens.TokenIntPair;
import org.apache.tika.eval.util.ContentTagParser;
import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.eval.util.EvalExceptionUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
Expand Down Expand Up @@ -183,10 +184,6 @@ public enum PARSE_ERROR_TYPE {
int maxTokens = 200000;


//these remove runtime info from the stacktraces so
//that actual causes can be counted.
private final static Pattern CAUSED_BY_SNIPPER =
Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");

private final static Pattern ACCESS_PERMISSION_EXCEPTION =
Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
Expand Down Expand Up @@ -495,7 +492,7 @@ int countMetadataValues(Metadata m) {

void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {

String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime");
String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION);

if (fullTrace == null) {
fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
Expand Down Expand Up @@ -528,11 +525,7 @@ void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
//IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
//For reporting purposes, let's snip off the object id so that we can more
//easily count exceptions.
String sortTrace = ExceptionUtils.trimMessage(fullTrace);

matcher = CAUSED_BY_SNIPPER.matcher(sortTrace);
sortTrace = matcher.replaceAll("$1");
sortTrace = sortTrace.replaceAll("org.apache.tika.", "o.a.t.");
String sortTrace = EvalExceptionUtils.normalize(fullTrace);
data.put(Cols.SORT_STACK_TRACE, sortTrace);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ public Double calculate(TokenCounts tokenCounts) {
p = (double) termFreq / totalTokens;
ent += p * FastMath.log(base, p);
}
return ent;
return -1.0*ent;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.tika.utils.ExceptionUtils;

/**
 * Helpers for turning raw stack trace strings into a normalized form
 * so that equivalent exceptions can be grouped and counted together.
 */
public class EvalExceptionUtils {

    /**
     * Matches a "Caused by: ..." line. Group 1 captures everything up to
     * (but not including) the second colon; the remainder of the line is
     * matched so that it can be dropped. This removes runtime info from
     * the stacktraces so that actual causes can be counted.
     */
    private static final Pattern CAUSED_BY_SNIPPER =
            Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");

    /** Package prefix that is abbreviated in the normalized trace. */
    private static final String TIKA_PACKAGE_PREFIX = "org.apache.tika.";

    /** Abbreviation substituted for {@link #TIKA_PACKAGE_PREFIX}. */
    private static final String TIKA_PACKAGE_ABBREVIATION = "o.a.t.";

    /**
     * Normalizes a stack trace for sorting/counting purposes.
     * <p>
     * The trace is first passed through {@link ExceptionUtils#trimMessage(String)},
     * then the text following the exception name on every "Caused by:" line is
     * removed, and finally each literal occurrence of {@code org.apache.tika.}
     * is shortened to {@code o.a.t.}.
     *
     * @param stacktrace the full stack trace; may be {@code null} or blank
     * @return the normalized trace, or the empty string if {@code stacktrace}
     *         is {@code null}, empty or whitespace only
     */
    public static String normalize(String stacktrace) {
        if (StringUtils.isBlank(stacktrace)) {
            return "";
        }
        String sortTrace = ExceptionUtils.trimMessage(stacktrace);

        Matcher matcher = CAUSED_BY_SNIPPER.matcher(sortTrace);
        sortTrace = matcher.replaceAll("$1");

        // Use the literal String.replace rather than replaceAll: the package
        // prefix contains dots, which replaceAll would interpret as regex
        // wildcards and could therefore rewrite unrelated text.
        return sortTrace.replace(TIKA_PACKAGE_PREFIX, TIKA_PACKAGE_ABBREVIATION);
    }
}

0 comments on commit e287fe2

Please sign in to comment.