-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnotes_exploration.py
125 lines (101 loc) · 4.26 KB
/
notes_exploration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import pandas as pd
import re
# Regex matching whole words made up solely of ASCII capital letters (e.g. "ABC").
# NOTE(review): no live code visible in this file references this constant; the
# commented-out exploration code further down inlines the same pattern instead.
uppercase_words_regex = r'\b[A-Z]+\b'
def list_documents(directory: str) -> list:
    """
    Recursively collect the names of Word ".docx" files below a directory.

    Only names ending in ".docx" (case-sensitive) are kept; legacy ".doc" files
    are currently not collected.

    Parameters:
    directory (str): Path of the directory whose tree is walked.

    Returns:
    List[str]: Bare filenames (without any folder component) of every ".docx"
    file found in the directory and its sub-folders, in the order os.walk
    yields them.
    """
    return [
        name
        for _root, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(".docx")  # ".doc"
    ]
def parse_filename(filename: str) -> dict:
    """
    Parse a note filename of the form
    "<client> <date> pos <pos1> [<pos2>] [<sessions>] <keywords>.<ext>".

    The extension is dropped and the remainder is split on whitespace. The first
    two tokens are taken as client and date, the third token (expected to be the
    literal "pos" marker) is skipped, and the numeric tokens that follow are read
    as one or two positions plus a session count. Everything after them is the
    keyword text.

    Parameters:
    filename (str): The filename to parse.

    Returns:
    dict: A dictionary with the keys "file", "client", "date", "position",
    "sessions" and "keywords".
    - "position" is "", "<pos1>" or "<pos1>,<pos2>".
    - "sessions" is the integer 0 when no session count was recognised, otherwise
      the digit string taken from the filename.
    - "keywords" has every "+" replaced by ",", the substrings "axel",
      "sessions" and "session" removed, and whitespace collapsed to single
      spaces with no leading/trailing blanks.

    Raises:
    IndexError: If the name has fewer than four whitespace-separated tokens.
    """
    # Split the name into tokens (removing the extension first)
    tokens = os.path.splitext(filename)[0].split()

    client = tokens[0]
    date = tokens[1]
    # tokens[2] is expected to be the "pos" marker and is not inspected.
    pos = ''
    sessions = 0
    keywords_start_idx = 4

    # Read positions and sessions after "pos".
    # NOTE(review): when tokens[3] is not numeric it is neither stored as a
    # position nor included in the keywords (they still start at index 4) -
    # confirm this is intended.
    if tokens[3].isdigit():
        pos = tokens[3]
        # A second number is only interpreted when at least one more token
        # follows it; otherwise it stays part of the keyword text.
        if len(tokens) > 5 and tokens[4].isdigit():
            if not tokens[5].isdigit():
                # "<pos1> <sessions> <keywords...>"
                sessions = tokens[4]
                keywords_start_idx = 5
            elif len(tokens) > 6:
                # "<pos1> <pos2> <sessions> <keywords...>"
                pos += ',' + tokens[4]
                sessions = tokens[5]
                keywords_start_idx = 6

    keywords = " ".join(tokens[keywords_start_idx:]).replace('+', ',')
    # Order matters: strip "sessions" before "session" so no stray "s" remains.
    for filler in ('axel', 'sessions', 'session'):
        keywords = keywords.replace(filler, '')
    # Removing filler words leaves doubled/leading/trailing blanks; tidy them up.
    keywords = " ".join(keywords.split())

    # Create and return the dictionary
    return {"file": filename, "client": client, "date": date, "position": pos, "sessions": sessions,
            "keywords": keywords}
def process_documents(doc_list: list) -> pd.DataFrame:
    """
    Parse a list of document filenames into a Pandas DataFrame, one row per
    successfully parsed filename.

    Parsing is best-effort: a filename for which parse_filename raises is
    reported on the console (together with the reason) and skipped; the
    exception is not propagated to the caller.

    Parameters:
    doc_list (list): A list of document filenames to be parsed.

    Returns:
    pd.DataFrame: A DataFrame with the columns 'file', 'client', 'date',
    'position', 'sessions' and 'keywords'. It is empty (but still has these
    columns) when no filename could be parsed.
    """
    columns = ['file', 'client', 'date', 'position', 'sessions', 'keywords']
    # Collect the records first and build the frame once; growing a DataFrame
    # row by row re-allocates on every insert.
    records = []
    for filename in doc_list:
        try:
            records.append(parse_filename(filename))
        except Exception as e:
            # Deliberately broad: one malformed name must not abort the batch.
            print(f'Error parsing file: {filename} ({e!r})')
    return pd.DataFrame(records, columns=columns)
def pos_filter(filename):
    """Return True when the filename contains the " pos " marker (case-insensitive)."""
    return ' pos ' in filename.lower()


# Guarded so that importing this module does not walk the dataset or write the CSV.
if __name__ == "__main__":
    # Named dataset_dir rather than dir to avoid shadowing the dir() builtin.
    dataset_dir = '/Users/apereira/Documents/Upshot/US02 - Pronotez/dataset'
    # Path of the output CSV file (a file path, despite the "_dir" suffix).
    csv_dir = './filenames_parsed.csv'

    docs = list_documents(dataset_dir)
    # Only filenames carrying the " pos " marker are parsed.
    docs_with_pos = list(filter(pos_filter, docs))
    df = process_documents(docs_with_pos)
    df.to_csv(csv_dir)
# doc_filter = lambda filename: filename.endswith('.doc')
# docx_filter = lambda filename: filename.endswith('.docx')
# plus_filter = lambda filename: '+' in filename
# session_filter = lambda filename: 'session' in filename
# not_session_filter = lambda filename: 'session' not in filename
# format_filename_lambda = lambda f: f.split()
# doc_list = list(filter(doc_filter, docs))
# docx_list = list(filter(docx_filter, docs))
# docs_with_pos = list(filter(pos_filter, docs))
# docs_with_plus = list(filter(plus_filter, docs_with_pos))
# docs_with_sessions = list(filter(session_filter, docs_with_pos))
# docs_not_sessions = list(filter(not_session_filter, docs_with_pos))
# df = pd.read_csv(csv_dir)
# upper_matches = df['keywords'].apply(lambda x: re.findall(r'\b[A-Z]+\b', x)).explode().unique()
# upper_matches = [x for x in upper_matches if pd.notna(x)]
# alfanum_matches = df['keywords'].apply(lambda x: re.findall(r'\b[A-Za-z\d]+-[A-Za-z\d]+\b', x)).explode().unique()
# alfanum_matches = [x for x in alfanum_matches if pd.notna(x)]
# print(upper_matches)
# print(alfanum_matches)