From fbaf43c5e686c3510228c8761885aaa78e70c37a Mon Sep 17 00:00:00 2001 From: Terance Edmonds Date: Tue, 1 Aug 2023 22:36:56 +0530 Subject: [PATCH 1/3] alternatives search algo --- .gitignore | 2 ++ regex.py | 79 +++++++++++++++++++++++++++++++++++------------------- test.py | 16 +++++------ 3 files changed, 61 insertions(+), 36 deletions(-) diff --git a/.gitignore b/.gitignore index 9460d9f..9b9fd10 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ *.example.py *.pdf *.backup.py +backups +backups/* __pycache__ \ No newline at end of file diff --git a/regex.py b/regex.py index 92a733a..2d2300c 100644 --- a/regex.py +++ b/regex.py @@ -2,11 +2,6 @@ def is_start_set(char): return char == '[' or char == '(' -def match_star(start, end, txt): - # for i in range(len(txt) + 1): - return [True] - - # extract set from expression def extract_set(char, exp): end_pos = 0 @@ -20,60 +15,86 @@ def extract_set(char, exp): # match range set def match_range(exp, txt, pos = 0): - if(len(txt) == 0): - return [True, pos] - + if(len(txt) == 0 or len(txt) - 1 == pos): + if(len(txt) - 1 == pos): + return [True, pos - 1] + else: + return [True, pos] + # if string contains lowercase letters if('a-z' in exp): - if(txt[0] >= 'a' and txt[0] <= 'z'): - return match_range(exp, txt[1:], pos + 1) + if(txt[pos] >= 'a' and txt[pos] <= 'z'): + return match_range(exp, txt, pos + 1) # if string contains uppercase letters if('A-Z' in exp): - if(txt[0] >= 'A' and txt[0] <= 'Z'): - return match_range(exp, txt[1:], pos + 1) + if(txt[pos] >= 'A' and txt[pos] <= 'Z'): + return match_range(exp, txt, pos + 1) # if string contains integers if('0-9' in exp): - if(txt[0] >= '0' and txt[0] <= '9'): - return match_range(exp, txt[1:], pos + 1) + if(txt[pos] >= '0' and txt[pos] <= '9'): + return match_range(exp, txt, pos + 1) return [False, pos] # match options set -def match_set(exp, txt): +def match_set(exp, txt, pos = 0, end = 0): + if(end == len(txt)): + return [True, end] + arr = exp.replace('(', '').replace(')', '').split('|') - return [txt in arr] + char = txt[end] + + + if any(char in s for s in arr): + [matched, end] = match_set(exp, txt, pos, end + 1) + + if(matched): + for item in arr: + if(item == txt[pos:end]): + [matched, txt_pos] = match_exp(item, txt[pos:end], 0, 0) + + if (matched): + return [True, end] + else: + return [False, pos + txt_pos] + + return [False, pos] # find match of the expression in the text def match_exp(exp, txt, txt_pos = 0, exp_pos = 0): # if expression is empty - we have checked all if (len(exp) == 0): - return True + return [True, txt_pos] # if match set of characters if(is_start_set(exp[exp_pos])): [set_exp, exp_pos] = extract_set(exp[exp_pos], exp) if(exp[0] == '['): - [matched, txt_pos] = match_range(set_exp, txt) + [matched, txt_pos] = match_range(set_exp, txt, txt_pos) + if(matched): - return True + return [True, txt_pos] elif(exp[0] == '('): - [matched] = match_set(set_exp, txt) + [matched, txt_pos] = match_set(set_exp, txt, txt_pos, txt_pos) + if(matched): - return True - + return [True, txt_pos] + # if character matches if(len(exp) > exp_pos and len(txt) > txt_pos): if (exp[exp_pos] == txt[txt_pos]): - if(match_exp(exp[(exp_pos + 1):], txt[(txt_pos + 1):])): - return True + [matched, txt_pos] = match_exp(exp[(exp_pos + 1):], txt, txt_pos + 1) + + if(matched): + return [True, txt_pos] # if nothing matches - return False + return [False, txt_pos] # if valid to start def is_valid(exp, txt): @@ -88,7 +109,9 @@ def init_match(exp, txt): if (is_valid(exp, txt)): # naive algorithm while txt_pos < len(txt) - 1: - if (match_exp(exp, txt[txt_pos:])): + [matched, txt_pos] = match_exp(exp, txt, txt_pos) + + if (matched): matched_count += 1 elif(is_start_set(exp[0])): @@ -98,9 +121,9 @@ def init_match(exp, txt): # if the matched count is greater than zero if (matched_count > 0): - return True + return [True, matched_count] - return False + return [False, 0] class RegEx: diff --git a/test.py b/test.py index 5fbafde..14fe91a 100644 --- a/test.py +++ b/test.py @@ -3,9 +3,12 @@ # run each test case def test(num, exp, txt): re = RegEx(exp) + result = re.match(txt) + print("Pattern: ", exp) print("String: ", txt) - print(f"Test {num} result: ", re.match(txt), end='\n\n') + print(f"Test {num} result: ", result[0]) + print(f"Match count: ", result[1], end='\n\n') if __name__ == '__main__': @@ -32,15 +35,12 @@ def test(num, exp, txt): 'string': 'hello99@gmail.com', }, { - 'pattern': '[A-Z]', - 'string': 'Hello', + 'pattern': '[a-z0-9]@[a-z].(moc)', + 'string': 'hello99@gmail.com', }, - ] - - tests_1 = [ { - 'pattern': '[a-z0-9]@[a-z].(com|net|org)', - 'string': 'hello99@mail.net', + 'pattern': '[A-Z]', + 'string': 'Hello', }, ] From af9d1847af2912b04c68eae33ef70eb2fe0502d5 Mon Sep 17 00:00:00 2001 From: Terance Edmonds Date: Tue, 1 Aug 2023 22:38:47 +0530 Subject: [PATCH 2/3] rename --- test.py => tests.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test.py => tests.py (100%) diff --git a/test.py b/tests.py similarity index 100% rename from test.py rename to tests.py From d48a36b382bb6d94259b2ffc5051dbb6310e0206 Mon Sep 17 00:00:00 2001 From: Terance Edmonds Date: Fri, 4 Aug 2023 09:57:46 +0530 Subject: [PATCH 3/3] start with --- ReadMe.md | 1 + regex.py | 62 +++++++++++++++++++++++++++++++++---------------------- tests.py | 15 +++++++++++++- 3 files changed, 52 insertions(+), 26 deletions(-) diff --git a/ReadMe.md b/ReadMe.md index 7195944..426fe54 100644 --- a/ReadMe.md +++ b/ReadMe.md @@ -6,6 +6,7 @@ It supports: - Literals ( abc ) - Ranges ( [a-z], [A-Z], [0-9], [a-zA-Z0-9] ) - Alternatives ( a|b ) + - Start ( ^[0-9], ^H ) - Matching in the middle of a string ## Usage diff --git a/regex.py b/regex.py index 2d2300c..3e69325 100644 --- a/regex.py +++ b/regex.py @@ -2,6 +2,10 @@ def is_start_set(char): return char == '[' or char == '(' +# is the character is to check starting character +def is_start(char): + return char == '^' + # extract set from expression def extract_set(char, exp): end_pos = 0 @@ -15,12 +19,11 @@ def extract_set(char, exp): # match range set def match_range(exp, txt, pos = 0): - if(len(txt) == 0 or len(txt) - 1 == pos): - if(len(txt) - 1 == pos): - return [True, pos - 1] - else: - return [True, pos] - + if(len(txt) == 0): + return [True, pos] + elif(pos > 0 and pos >= len(txt) - 1): + return [True, pos - 1] + # if string contains lowercase letters if('a-z' in exp): if(txt[pos] >= 'a' and txt[pos] <= 'z'): @@ -43,13 +46,15 @@ def match_set(exp, txt, pos = 0, end = 0): if(end == len(txt)): return [True, end] + # remove parenthesis and split by "|" arr = exp.replace('(', '').replace(')', '').split('|') char = txt[end] - + # check if characters in txt is in the expression if any(char in s for s in arr): [matched, end] = match_set(exp, txt, pos, end + 1) + # validate if the matched string is exact the same if(matched): for item in arr: if(item == txt[pos:end]): @@ -69,16 +74,24 @@ def match_exp(exp, txt, txt_pos = 0, exp_pos = 0): if (len(exp) == 0): return [True, txt_pos] + # if to check the starting character + if (is_start(exp[0]) and txt_pos == 0): + return match_exp(exp[1:], txt[0]) + elif (is_start(exp[0]) and txt_pos != 0): + return [False, txt_pos] + # if match set of characters if(is_start_set(exp[exp_pos])): [set_exp, exp_pos] = extract_set(exp[exp_pos], exp) + # if the expression start is a range if(exp[0] == '['): [matched, txt_pos] = match_range(set_exp, txt, txt_pos) - + if(matched): return [True, txt_pos] + # if the expression start is a set elif(exp[0] == '('): [matched, txt_pos] = match_set(set_exp, txt, txt_pos, txt_pos) @@ -96,28 +109,27 @@ def match_exp(exp, txt, txt_pos = 0, exp_pos = 0): # if nothing matches return [False, txt_pos] -# if valid to start -def is_valid(exp, txt): - return len(txt) >= len(exp) or is_start_set(exp[0]) - # start matching def init_match(exp, txt): matched_count = 0 txt_pos = 0 - # if the text length is greater than the expression proceed - if (is_valid(exp, txt)): - # naive algorithm - while txt_pos < len(txt) - 1: - [matched, txt_pos] = match_exp(exp, txt, txt_pos) - - if (matched): - matched_count += 1 - - elif(is_start_set(exp[0])): - break - - txt_pos += 1 + # naive algorithm + while txt_pos < len(txt) - 1: + [matched, txt_pos] = match_exp(exp, txt, txt_pos) + + # is matched increase the count + if (matched): + matched_count += 1 + # if not matched by the set pattern then end + elif(is_start_set(exp[0])): + break + # if not matched by the first character then end + elif(is_start(exp[0])): + break + + # increment the text pointer position + txt_pos += 1 # if the matched count is greater than zero if (matched_count > 0): diff --git a/tests.py b/tests.py index 14fe91a..187a52c 100644 --- a/tests.py +++ b/tests.py @@ -5,9 +5,10 @@ def test(num, exp, txt): re = RegEx(exp) result = re.match(txt) + print(f"=== Test {num} === ") print("Pattern: ", exp) print("String: ", txt) - print(f"Test {num} result: ", result[0]) + print("Result: ", result[0]) print(f"Match count: ", result[1], end='\n\n') @@ -42,6 +43,18 @@ def test(num, exp, txt): 'pattern': '[A-Z]', 'string': 'Hello', }, + { + 'pattern': '^[0-9]', + 'string': 'Hello', + }, + { + 'pattern': '^[0-9]', + 'string': '7Hello', + }, + { + 'pattern': '^H', + 'string': 'Hello', + }, ] # run all test cases