Skip to content

Commit

Permalink
remove fuzziness from number segments in SingleError mode. close #50.
Browse files Browse the repository at this point in the history
  • Loading branch information
leeoniya committed Dec 9, 2023
1 parent 2bfa63f commit 3b861dc
Show file tree
Hide file tree
Showing 7 changed files with 322 additions and 263 deletions.
9 changes: 8 additions & 1 deletion demos/testdata.json
Original file line number Diff line number Diff line change
Expand Up @@ -161900,6 +161900,13 @@
"который",
"Alle må holde i et tau og være sikret slik at de er trygge. Etter isbreen må de gå i en ganske stor steinrøys og så er de endelig fremme.",
"interface-id-face-scan-2-identification-angle-secure-human-id-person-face-security-brackets",
"Sabine State Bank and Trust Company"
"Sabine State Bank and Trust Company",
"abc1234",
"abc2134",
"ab1c234",
"abc 1234",
"abc123acb",
"abc123acb supper",
"1234"
]
}
143 changes: 78 additions & 65 deletions dist/uFuzzy.cjs.js
Original file line number Diff line number Diff line change
Expand Up @@ -165,27 +165,30 @@ function uFuzzy(opts) {
_intraTrn = 0,
_intraDel = 0;

let plen = p.length;

// prevent junk matches by requiring stricter rules for short terms
if (plen <= 4) {
if (plen >= 3) {
// one swap in non-first char when 3-4 chars
_intraTrn = Math.min(intraTrn, 1);

// or one insertion when 4 chars
if (plen == 4)
_intraIns = Math.min(intraIns, 1);
// only-digits strings should match exactly, else special rules for short strings
if (/[^\d]/.test(p)) {
let plen = p.length;

// prevent junk matches by requiring stricter rules for short terms
if (plen <= 4) {
if (plen >= 3) {
// one swap in non-first char when 3-4 chars
_intraTrn = Math.min(intraTrn, 1);

// or one insertion when 4 chars
if (plen == 4)
_intraIns = Math.min(intraIns, 1);
}
// else exact match when 1-2 chars
}
// use supplied opts
else {
_intraSlice = intraSlice;
_intraIns = intraIns,
_intraSub = intraSub,
_intraTrn = intraTrn,
_intraDel = intraDel;
}
// else exact match when 1-2 chars
}
// use supplied opts
else {
_intraSlice = intraSlice;
_intraIns = intraIns,
_intraSub = intraSub,
_intraTrn = intraTrn,
_intraDel = intraDel;
}

return {
Expand Down Expand Up @@ -223,6 +226,8 @@ function uFuzzy(opts) {
return needle.split(interSplit).filter(t => t != '').map(v => v === EXACT_HERE ? exacts[j++] : v);
};

const NUM_OR_ALPHA_RE = /[^\d]+|\d+/g;

const prepQuery = (needle, capt = 0, interOR = false) => {
// split on punct, whitespace, num-alpha, and upper-lower boundaries
let parts = split(needle);
Expand All @@ -243,64 +248,72 @@ function uFuzzy(opts) {
// allows single mutations within each term
if (intraMode == 1) {
reTpl = parts.map((p, pi) => {
let {
intraSlice,
intraIns,
intraSub,
intraTrn,
intraDel,
} = intraRules(p);

if (intraIns + intraSub + intraTrn + intraDel == 0)
return p + contrs[pi];

if (p[0] === '"')
return escapeRegExp(p.slice(1, -1));

let [lftIdx, rgtIdx] = intraSlice;
let lftChar = p.slice(0, lftIdx); // prefix
let rgtChar = p.slice(rgtIdx); // suffix
let reTpl = '';

let chars = p.slice(lftIdx, rgtIdx);
// split into numeric and non-numeric parts, so that numbers are matched only where they follow punct or alpha boundaries, and without swaps or insertions
for (let m of p.matchAll(NUM_OR_ALPHA_RE)) {
let p = m[0];

// neg lookahead to prefer matching 'Test' instead of 'tTest' in ManifestTest or fittest
// but skip when search term contains leading repetition (aardvark, aaa)
if (intraIns == 1 && lftChar.length == 1 && lftChar != chars[0])
lftChar += '(?!' + lftChar + ')';
let {
intraSlice,
intraIns,
intraSub,
intraTrn,
intraDel,
} = intraRules(p);

let numChars = chars.length;
if (intraIns + intraSub + intraTrn + intraDel == 0)
reTpl += p + contrs[pi];
else {
let [lftIdx, rgtIdx] = intraSlice;
let lftChar = p.slice(0, lftIdx); // prefix
let rgtChar = p.slice(rgtIdx); // suffix

let variants = [p];
let chars = p.slice(lftIdx, rgtIdx);

// variants with single char substitutions
if (intraSub) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraChars + chars.slice(i + 1) + rgtChar);
}
// neg lookahead to prefer matching 'Test' instead of 'tTest' in ManifestTest or fittest
// but skip when search term contains leading repetition (aardvark, aaa)
if (intraIns == 1 && lftChar.length == 1 && lftChar != chars[0])
lftChar += '(?!' + lftChar + ')';

// variants with single transpositions
if (intraTrn) {
for (let i = 0; i < numChars - 1; i++) {
if (chars[i] != chars[i+1])
variants.push(lftChar + chars.slice(0, i) + chars[i+1] + chars[i] + chars.slice(i + 2) + rgtChar);
}
}
let numChars = chars.length;

// variants with single char omissions
if (intraDel) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i + 1) + '?' + chars.slice(i + 1) + rgtChar);
}
let variants = [p];

// variants with single char insertions
if (intraIns) {
let intraInsTpl = lazyRepeat(intraChars, 1);
// variants with single char substitutions
if (intraSub) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraChars + chars.slice(i + 1) + rgtChar);
}

for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraInsTpl + chars.slice(i) + rgtChar);
}
// variants with single transpositions
if (intraTrn) {
for (let i = 0; i < numChars - 1; i++) {
if (chars[i] != chars[i+1])
variants.push(lftChar + chars.slice(0, i) + chars[i+1] + chars[i] + chars.slice(i + 2) + rgtChar);
}
}

let reTpl = '(?:' + variants.join('|') + ')' + contrs[pi];
// variants with single char omissions
if (intraDel) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i + 1) + '?' + chars.slice(i + 1) + rgtChar);
}

// variants with single char insertions
if (intraIns) {
let intraInsTpl = lazyRepeat(intraChars, 1);

for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraInsTpl + chars.slice(i) + rgtChar);
}

reTpl += '(?:' + variants.join('|') + ')' + contrs[pi];
}
}

// console.log(reTpl);

Expand Down
143 changes: 78 additions & 65 deletions dist/uFuzzy.esm.js
Original file line number Diff line number Diff line change
Expand Up @@ -163,27 +163,30 @@ function uFuzzy(opts) {
_intraTrn = 0,
_intraDel = 0;

let plen = p.length;

// prevent junk matches by requiring stricter rules for short terms
if (plen <= 4) {
if (plen >= 3) {
// one swap in non-first char when 3-4 chars
_intraTrn = Math.min(intraTrn, 1);

// or one insertion when 4 chars
if (plen == 4)
_intraIns = Math.min(intraIns, 1);
// only-digits strings should match exactly, else special rules for short strings
if (/[^\d]/.test(p)) {
let plen = p.length;

// prevent junk matches by requiring stricter rules for short terms
if (plen <= 4) {
if (plen >= 3) {
// one swap in non-first char when 3-4 chars
_intraTrn = Math.min(intraTrn, 1);

// or one insertion when 4 chars
if (plen == 4)
_intraIns = Math.min(intraIns, 1);
}
// else exact match when 1-2 chars
}
// use supplied opts
else {
_intraSlice = intraSlice;
_intraIns = intraIns,
_intraSub = intraSub,
_intraTrn = intraTrn,
_intraDel = intraDel;
}
// else exact match when 1-2 chars
}
// use supplied opts
else {
_intraSlice = intraSlice;
_intraIns = intraIns,
_intraSub = intraSub,
_intraTrn = intraTrn,
_intraDel = intraDel;
}

return {
Expand Down Expand Up @@ -221,6 +224,8 @@ function uFuzzy(opts) {
return needle.split(interSplit).filter(t => t != '').map(v => v === EXACT_HERE ? exacts[j++] : v);
};

const NUM_OR_ALPHA_RE = /[^\d]+|\d+/g;

const prepQuery = (needle, capt = 0, interOR = false) => {
// split on punct, whitespace, num-alpha, and upper-lower boundaries
let parts = split(needle);
Expand All @@ -241,64 +246,72 @@ function uFuzzy(opts) {
// allows single mutations within each term
if (intraMode == 1) {
reTpl = parts.map((p, pi) => {
let {
intraSlice,
intraIns,
intraSub,
intraTrn,
intraDel,
} = intraRules(p);

if (intraIns + intraSub + intraTrn + intraDel == 0)
return p + contrs[pi];

if (p[0] === '"')
return escapeRegExp(p.slice(1, -1));

let [lftIdx, rgtIdx] = intraSlice;
let lftChar = p.slice(0, lftIdx); // prefix
let rgtChar = p.slice(rgtIdx); // suffix
let reTpl = '';

let chars = p.slice(lftIdx, rgtIdx);
// split into numeric and non-numeric parts, so that numbers are matched only where they follow punct or alpha boundaries, and without swaps or insertions
for (let m of p.matchAll(NUM_OR_ALPHA_RE)) {
let p = m[0];

// neg lookahead to prefer matching 'Test' instead of 'tTest' in ManifestTest or fittest
// but skip when search term contains leading repetition (aardvark, aaa)
if (intraIns == 1 && lftChar.length == 1 && lftChar != chars[0])
lftChar += '(?!' + lftChar + ')';
let {
intraSlice,
intraIns,
intraSub,
intraTrn,
intraDel,
} = intraRules(p);

let numChars = chars.length;
if (intraIns + intraSub + intraTrn + intraDel == 0)
reTpl += p + contrs[pi];
else {
let [lftIdx, rgtIdx] = intraSlice;
let lftChar = p.slice(0, lftIdx); // prefix
let rgtChar = p.slice(rgtIdx); // suffix

let variants = [p];
let chars = p.slice(lftIdx, rgtIdx);

// variants with single char substitutions
if (intraSub) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraChars + chars.slice(i + 1) + rgtChar);
}
// neg lookahead to prefer matching 'Test' instead of 'tTest' in ManifestTest or fittest
// but skip when search term contains leading repetition (aardvark, aaa)
if (intraIns == 1 && lftChar.length == 1 && lftChar != chars[0])
lftChar += '(?!' + lftChar + ')';

// variants with single transpositions
if (intraTrn) {
for (let i = 0; i < numChars - 1; i++) {
if (chars[i] != chars[i+1])
variants.push(lftChar + chars.slice(0, i) + chars[i+1] + chars[i] + chars.slice(i + 2) + rgtChar);
}
}
let numChars = chars.length;

// variants with single char omissions
if (intraDel) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i + 1) + '?' + chars.slice(i + 1) + rgtChar);
}
let variants = [p];

// variants with single char insertions
if (intraIns) {
let intraInsTpl = lazyRepeat(intraChars, 1);
// variants with single char substitutions
if (intraSub) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraChars + chars.slice(i + 1) + rgtChar);
}

for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraInsTpl + chars.slice(i) + rgtChar);
}
// variants with single transpositions
if (intraTrn) {
for (let i = 0; i < numChars - 1; i++) {
if (chars[i] != chars[i+1])
variants.push(lftChar + chars.slice(0, i) + chars[i+1] + chars[i] + chars.slice(i + 2) + rgtChar);
}
}

let reTpl = '(?:' + variants.join('|') + ')' + contrs[pi];
// variants with single char omissions
if (intraDel) {
for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i + 1) + '?' + chars.slice(i + 1) + rgtChar);
}

// variants with single char insertions
if (intraIns) {
let intraInsTpl = lazyRepeat(intraChars, 1);

for (let i = 0; i < numChars; i++)
variants.push(lftChar + chars.slice(0, i) + intraInsTpl + chars.slice(i) + rgtChar);
}

reTpl += '(?:' + variants.join('|') + ')' + contrs[pi];
}
}

// console.log(reTpl);

Expand Down
Loading

0 comments on commit 3b861dc

Please sign in to comment.