fuzzy-search.py
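"""
fuzzy-search.py

Groups similar messages from a CSV export (expects a 'message' column) by
fuzzy string matching. Messages are read in chunks with pandas, compared
using thefuzz's Levenshtein-based ratio, and a summary CSV of each matcher
and its match count is written, sorted by group size.
"""
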
import csv
import sys
import time
import os
import pandas as pd
from thefuzz import fuzz
from tqdm import tqdm

def sort_large_csv_pandas(input_file, output_file, chunksize=100000):
    """
    Sorts a large CSV file by its 'message' column using pandas.

    Args:
        input_file: Path to the input CSV file.
        output_file: Path to the output CSV file.
        chunksize: Number of rows to read per chunk.
    """
    # Read in chunks, then sort the combined frame once; sorting each
    # chunk individually before the full sort would be redundant work.
    df_list = list(pd.read_csv(input_file, chunksize=chunksize))
    df = pd.concat(df_list)
    df.sort_values(by=['message'], inplace=True)
    df.to_csv(output_file, index=False)

def convert_to_csv(similarity_data_list, output_file):
    """Writes one row per matcher with its match count."""
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['Matcher', 'Count']
        writer = csv.writer(csvfile)
        writer.writerow(fieldnames)
        for data in similarity_data_list:
            # To also dump every match and score, write
            # [data.matcher, data.matches, data.scores, data.count] instead.
            writer.writerow([data.matcher, data.count])

class SimilarityData:
    """Groups messages that fuzzy-match a representative 'matcher' string."""

    def __init__(self, matcher: str, matches: list, scores: list, count: int):
        self.matcher = matcher  # Representative message for this group
        self.matches = matches  # Messages matched against the matcher
        self.scores = scores    # Similarity score (0-100) for each match
        self.count = count      # Number of messages in this group

    def update_matches_and_scores(self, new_match, new_score):
        # The constructor always sets the lists, so no None-guard is needed.
        self.matches.append(new_match)
        self.scores.append(new_score)
        self.count += 1

    def __str__(self):
        return f"Matcher: {self.matcher}, Matches: {self.matches}, Scores: {self.scores}, Count: {self.count}"

# Process the CSV in chunks, grouping messages by fuzzy similarity.
def process_large_csv(file_path, output_file, total_messages):
    chunk_size = 10  # Adjust based on your memory capacity
    sleep_time = 0  # Optional per-message throttle, in seconds
    required_score = 50  # Minimum fuzz.ratio (0-100) to join an existing group
    similarity_data_list = []
    skipped = 0
    current_message = 0
    num_chunks = (total_messages + chunk_size - 1) // chunk_size
    for chunk in tqdm(pd.read_csv(file_path, chunksize=chunk_size), total=num_chunks):
        messages = chunk['message'].tolist()
        for message in messages:
            current_message += 1
            # Skip non-string values (e.g. NaN from empty cells) and very long messages.
            if not isinstance(message, str) or len(message) > 10000:
                skipped += 1
                continue
            time.sleep(sleep_time)
            # Compare this message against each existing matcher and join the
            # first group that scores high enough; this is O(groups) per message.
            matcher_found = False
            for data in similarity_data_list:
                score = fuzz.ratio(data.matcher, message)
                if score >= required_score:
                    data.update_matches_and_scores(message, score)
                    matcher_found = True
                    break
            if not matcher_found:
                # Start a new group, counting the matcher itself as its first member.
                similarity_data_list.append(SimilarityData(message, [], [], 1))
            if current_message >= total_messages:
                break
        if current_message >= total_messages:
            break
    print(f"Skipped: {skipped}")
    # Sort groups by size, largest first, then write the summary CSV.
    similarity_data_list.sort(key=lambda x: x.count, reverse=True)
    convert_to_csv(similarity_data_list, output_file)

# Specify file paths and defaults.
base_path = '/Users/vaibhav.malik/Downloads/'
input_file_name = 'All-Messages-search-result.csv'
input_file_path = base_path + input_file_name
total_messages = 10000
if len(sys.argv) < 2:
    print(f"Input filename was not specified, defaulting to {input_file_path}")
else:
    input_file_path = sys.argv[1]
    input_file_name = os.path.basename(input_file_path)
if len(sys.argv) < 3:
    print(f"Total messages to scan was not specified, defaulting to {total_messages} messages")
else:
    total_messages = int(sys.argv[2])
print(f"Input file: {input_file_path}")
print(f"Total messages to be scanned: {total_messages}")
# Process the large CSV file. To pre-sort the input first, uncomment the two
# lines below and pass sorted_input_file_path to process_large_csv instead.
# sorted_input_file_path = base_path + "Sorted-" + input_file_name
# sort_large_csv_pandas(input_file_path, sorted_input_file_path)
output_file_path = base_path + "Analysis-" + input_file_name
process_large_csv(input_file_path, output_file_path, total_messages)
print(f"Similarity data saved to: {output_file_path}")