Skip to content

Commit

Permalink
LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor
Browse files Browse the repository at this point in the history
  • Loading branch information
patrick-marty authored and bruno-roustant committed Jan 19, 2021
1 parent 9f5bdf4 commit 227256d
Show file tree
Hide file tree
Showing 10 changed files with 82 additions and 63 deletions.
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ API Changes
* LUCENE-9317 LUCENE-9318 LUCENE-9319 LUCENE-9558 LUCENE-9600 : Clean up package name conflicts
between modules. See MIGRATE.md for details. (David Ryan, Tomoko Uchida, Uwe Schindler, Dawid Weiss)

* LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via Bruno Roustant)

Improvements

* LUCENE-9665 LUCENE-9676 Hunspell improvements: support default encoding, improve stemming of all-caps words
Expand Down
5 changes: 5 additions & 0 deletions lucene/MIGRATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ NativeUnixDirectory in the misc module was therefore removed and replaced
by DirectIODirectory. To use it, you need a JVM and operating system that
supports Direct IO.

## BM25Similarity.setDiscountOverlaps and LegacyBM25Similarity.setDiscountOverlaps methods removed (LUCENE-9646)

The discountOverlaps parameter for both BM25Similarity and LegacyBM25Similarity
is now set by the constructor of those classes, e.g. `new BM25Similarity(k1, b, discountOverlaps)`.

## Packages in misc module are renamed (LUCENE-9600)

Following package names in misc module are renamed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,19 @@
public class BM25Similarity extends Similarity {
private final float k1;
private final float b;
private final boolean discountOverlaps;

/**
* BM25 with the supplied parameter values.
*
* @param k1 Controls non-linear term frequency normalization (saturation).
* @param b Controls to what degree document length normalizes tf values.
* @param discountOverlaps True if overlap tokens (tokens with a position increment of zero)
* are discounted from the document's length.
* @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is not
* within the range {@code [0..1]}
*/
public BM25Similarity(float k1, float b) {
public BM25Similarity(float k1, float b, boolean discountOverlaps) {
if (Float.isFinite(k1) == false || k1 < 0) {
throw new IllegalArgumentException(
"illegal k1 value: " + k1 + ", must be a non-negative finite value");
Expand All @@ -53,6 +56,19 @@ public BM25Similarity(float k1, float b) {
}
this.k1 = k1;
this.b = b;
this.discountOverlaps = discountOverlaps;
}

/**
* BM25 with the supplied parameter values and {@code discountOverlaps = true}.
*
* <p>Delegates to {@link #BM25Similarity(float, float, boolean)}, which performs the argument
* checks described below.
*
* @param k1 Controls non-linear term frequency normalization (saturation).
* @param b Controls to what degree document length normalizes tf values.
* @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is not
* within the range {@code [0..1]}
*/
public BM25Similarity(float k1, float b) {
this(k1, b, true);
}

/**
Expand All @@ -62,9 +78,27 @@ public BM25Similarity(float k1, float b) {
* <li>{@code k1 = 1.2}
* <li>{@code b = 0.75}
* </ul>
*
* and the supplied parameter value:
*
* @param discountOverlaps True if overlap tokens (tokens with a position increment of zero)
* are discounted from the document's length.
*/
public BM25Similarity(boolean discountOverlaps) {
// Use the default k1 = 1.2 and b = 0.75; only discountOverlaps is supplied by the caller.
this(1.2f, 0.75f, discountOverlaps);
}

/**
* BM25 with these default values:
*
* <ul>
* <li>{@code k1 = 1.2}
* <li>{@code b = 0.75}
* <li>{@code discountOverlaps = true}
* </ul>
*/
public BM25Similarity() {
this(1.2f, 0.75f);
this(1.2f, 0.75f, true);
}

/** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
Expand All @@ -82,24 +116,10 @@ protected float avgFieldLength(CollectionStatistics collectionStats) {
return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
}

/**
* True if overlap tokens (tokens with a position of increment of zero) are discounted from the
* document's length.
*/
protected boolean discountOverlaps = true;

/**
* Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing norm.
* By default this is true, meaning overlap tokens do not count when computing norms.
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
}

/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #setDiscountOverlaps
* @see #BM25Similarity(float, float, boolean)
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ public void testPhraseScoreIsEqualToBoost() throws IOException {
public void testSameNormsAsBM25() {
BooleanSimilarity sim1 = new BooleanSimilarity();
BM25Similarity sim2 = new BM25Similarity();
sim2.setDiscountOverlaps(true);
for (int iter = 0; iter < 100; ++iter) {
final int length = TestUtil.nextInt(random(), 1, 100);
final int position = random().nextInt(length);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@ public void testSaneNormValues() throws IOException {
public void testSameNormsAsBM25() {
ClassicSimilarity sim1 = new ClassicSimilarity();
BM25Similarity sim2 = new BM25Similarity();
sim2.setDiscountOverlaps(true);
for (int iter = 0; iter < 100; ++iter) {
final int length = TestUtil.nextInt(random(), 1, 1000);
final int position = random().nextInt(length);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -512,17 +512,16 @@ public void tearDown() throws Exception {

// LUCENE-5221
public void testDiscountOverlapsBoost() throws IOException {
BM25Similarity expected = new BM25Similarity();
BM25Similarity expected = new BM25Similarity(false);
SimilarityBase actual =
new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
expected.setDiscountOverlaps(false);
actual.setDiscountOverlaps(false);
FieldInvertState state =
new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
state.setLength(5);
state.setNumOverlap(2);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
expected.setDiscountOverlaps(true);
expected = new BM25Similarity();
actual.setDiscountOverlaps(true);
assertEquals(expected.computeNorm(state), actual.computeNorm(state));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,8 @@ private Similarity createSimilarity(SimilarityConfig config) {
tfidf.setDiscountOverlaps(config.isDiscountOverlaps());
similarity = tfidf;
} else {
BM25Similarity bm25 = new BM25Similarity(config.getK1(), config.getB());
bm25.setDiscountOverlaps(config.isDiscountOverlaps());
BM25Similarity bm25 =
new BM25Similarity(config.getK1(), config.getB(), config.isDiscountOverlaps());
similarity = bm25;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public final class LegacyBM25Similarity extends Similarity {
* <ul>
* <li>{@code k1 = 1.2}
* <li>{@code b = 0.75}
* <li>{@code discountOverlaps = true}
* </ul>
*/
public LegacyBM25Similarity() {
Expand All @@ -58,6 +59,20 @@ public LegacyBM25Similarity(float k1, float b) {
this.bm25Similarity = new BM25Similarity(k1, b);
}

/**
* BM25 with the supplied parameter values.
*
* <p>All three values are passed unchanged to the wrapped {@link BM25Similarity} instance.
*
* @param k1 Controls non-linear term frequency normalization (saturation).
* @param b Controls to what degree document length normalizes tf values.
* @param discountOverlaps True if overlap tokens (tokens with a position increment of zero)
* are discounted from the document's length.
* @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is not
* within the range {@code [0..1]}
*/
public LegacyBM25Similarity(float k1, float b, boolean discountOverlaps) {
this.bm25Similarity = new BM25Similarity(k1, b, discountOverlaps);
}

@Override
public long computeNorm(FieldInvertState state) {
return bm25Similarity.computeNorm(state);
Expand Down Expand Up @@ -87,18 +102,10 @@ public final float getB() {
return bm25Similarity.getB();
}

/**
* Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing norm.
* By default this is true, meaning overlap tokens do not count when computing norms.
*/
public void setDiscountOverlaps(boolean v) {
bm25Similarity.setDiscountOverlaps(v);
}

/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #setDiscountOverlaps
* @see #LegacyBM25Similarity(float, float, boolean)
*/
public boolean getDiscountOverlaps() {
return bm25Similarity.getDiscountOverlaps();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,33 +32,27 @@
* The default is <code>1.2</code>
* <li>b (float): Controls to what degree document length normalizes tf values.
* The default is <code>0.75</code>
* </ul>
* <p>
* Optional settings:
* <ul>
* <li>discountOverlaps (bool): Sets
* {@link BM25Similarity#setDiscountOverlaps(boolean)}</li>
* <li>discountOverlaps (bool): True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
* The default is <code>true</code>
* </ul>
* @lucene.experimental
* @since 8.0.0
*/
public class BM25SimilarityFactory extends SimilarityFactory {
private boolean discountOverlaps;
private float k1;
private float b;
private BM25Similarity similarity;

@Override
public void init(SolrParams params) {
super.init(params);
discountOverlaps = params.getBool("discountOverlaps", true);
k1 = params.getFloat("k1", 1.2f);
b = params.getFloat("b", 0.75f);
boolean discountOverlaps = params.getBool("discountOverlaps", true);
float k1 = params.getFloat("k1", 1.2f);
float b = params.getFloat("b", 0.75f);
similarity = new BM25Similarity(k1, b, discountOverlaps);
}

@Override
public Similarity getSimilarity() {
BM25Similarity sim = new BM25Similarity(k1, b);
sim.setDiscountOverlaps(discountOverlaps);
return sim;
return similarity;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,33 +32,27 @@
* The default is <code>1.2</code>
* <li>b (float): Controls to what degree document length normalizes tf values.
* The default is <code>0.75</code>
* </ul>
* <p>
* Optional settings:
* <ul>
* <li>discountOverlaps (bool): Sets
* {@link LegacyBM25Similarity#setDiscountOverlaps(boolean)}</li>
* <li>discountOverlaps (bool): True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
* The default is <code>true</code>
* </ul>
* @lucene.experimental
* @since 8.0.0
*/
public class LegacyBM25SimilarityFactory extends SimilarityFactory {
private boolean discountOverlaps;
private float k1;
private float b;
private LegacyBM25Similarity similarity;

@Override
public void init(SolrParams params) {
super.init(params);
discountOverlaps = params.getBool("discountOverlaps", true);
k1 = params.getFloat("k1", 1.2f);
b = params.getFloat("b", 0.75f);
boolean discountOverlaps = params.getBool("discountOverlaps", true);
float k1 = params.getFloat("k1", 1.2f);
float b = params.getFloat("b", 0.75f);
similarity = new LegacyBM25Similarity(k1, b, discountOverlaps);
}

@Override
public Similarity getSimilarity() {
LegacyBM25Similarity sim = new LegacyBM25Similarity(k1, b);
sim.setDiscountOverlaps(discountOverlaps);
return sim;
return similarity;
}
}

0 comments on commit 227256d

Please sign in to comment.