-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfuzzy_match.c
158 lines (137 loc) · 3.83 KB
/
fuzzy_match.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
/*
* Copyright (C) 2022 Philip Jones
*
* Licensed under the MIT License.
* https://opensource.org/licenses/MIT
*/
#define _GNU_SOURCE /* Required for strcasestr. */
#include <ctype.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "fuzzy_match.h"
#undef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
static int32_t compute_score(
int32_t jump,
bool first_char,
const char *restrict match);
static int32_t fuzzy_match_recurse(
const char *restrict pattern,
const char *restrict str,
int32_t score,
bool first_char);
/*
* Returns score if each character in pattern is found sequentially within str.
* Returns INT32_MIN otherwise.
*/
int32_t fuzzy_match(const char *restrict pattern, const char *restrict str)
{
const int unmatched_letter_penalty = -1;
const size_t slen = strlen(str);
const size_t plen = strlen(pattern);
int32_t score = 100;
if (*pattern == '\0') {
return score;
}
if (slen < plen) {
return INT32_MIN;
}
/* We can already penalise any unused letters. */
score += unmatched_letter_penalty * (int32_t)(slen - plen);
/* Perform the match. */
score = fuzzy_match_recurse(pattern, str, score, true);
return score;
}
/*
* Recursively match the whole of pattern against str.
* The score parameter is the score of the previously matched character.
*
* This reaches a maximum recursion depth of strlen(pattern) + 1. However, the
* stack usage is small (the maximum I've seen on x86_64 is 144 bytes with
* gcc -O3), so this shouldn't matter unless pattern contains thousands of
* characters.
*/
int32_t fuzzy_match_recurse(
const char *restrict pattern,
const char *restrict str,
int32_t score,
bool first_char)
{
if (*pattern == '\0') {
/* We've matched the full pattern. */
return score;
}
const char *match = str;
const char search[2] = { *pattern, '\0' };
int32_t best_score = INT32_MIN;
/*
* Find all occurrences of the next pattern character in str, and
* recurse on them.
*/
while ((match = strcasestr(match, search)) != NULL) {
int32_t subscore = fuzzy_match_recurse(
pattern + 1,
match + 1,
compute_score(match - str, first_char, match),
false);
best_score = MAX(best_score, subscore);
match++;
}
if (best_score == INT32_MIN) {
/* We couldn't match the rest of the pattern. */
return INT32_MIN;
} else {
return score + best_score;
}
}
/*
* Calculate the score for a single matching letter.
* The scoring system is taken from fts_fuzzy_match v0.2.0 by Forrest Smith,
* which is licensed to the public domain.
*
* The factors affecting score are:
* - Bonuses:
* - If there are multiple adjacent matches.
* - If a match occurs after a separator character.
* - If a match is uppercase, and the previous character is lowercase.
*
* - Penalties:
* - If there are letters before the first match.
* - If there are superfluous characters in str (already accounted for).
*/
int32_t compute_score(int32_t jump, bool first_char, const char *restrict match)
{
const int adjacency_bonus = 15;
const int separator_bonus = 30;
const int camel_bonus = 30;
const int first_letter_bonus = 15;
const int leading_letter_penalty = -5;
const int max_leading_letter_penalty = -15;
int32_t score = 0;
/* Apply bonuses. */
if (!first_char && jump == 0) {
score += adjacency_bonus;
}
if (!first_char || jump > 0) {
if (isupper((unsigned char)*match)
&& islower((unsigned char)*(match - 1))) {
score += camel_bonus;
}
if (isalnum((unsigned char)*match)
&& !isalnum((unsigned char)*(match - 1))) {
score += separator_bonus;
}
}
if (first_char && jump == 0) {
/* Match at start of string gets separator bonus. */
score += first_letter_bonus;
}
/* Apply penalties. */
if (first_char) {
score += MAX(leading_letter_penalty * jump,
max_leading_letter_penalty);
}
return score;
}