20230405b Pickle dump, CountVectorizer, fix canonicizers
Added an option to save experiment results as Pickle;
Switched to sklearn's CountVectorizer for the "Frequency" embedding;
Fixed the full-width option key in the canonicizers.
Michaeljfang committed Apr 5, 2023
1 parent 949624d commit e733a59
Showing 4 changed files with 80 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Constants.py
@@ -1,3 +1,3 @@
# PyGAAP Constants
version = "1.0.0 alpha 2"
versiondate = "2023.03.22"
versiondate = "2023.04.05"
41 changes: 23 additions & 18 deletions backend/GUI/GUI2.py
@@ -28,6 +28,7 @@
from sys import platform
from json import load as json_load
from json import dump as json_dump
from pickle import dump as pickle_dump
from os import listdir as ls
from time import sleep
from pathlib import Path
@@ -328,25 +329,20 @@ def display_results(self):

results_export_json = Button(
export_buttons_frame, text="Save as json",
command = lambda file_types=(("JSON", "*.json"), ("Text File", "*.txt"), ("All Files", "*.*")),
title="Save experiment results as json":
json_dump(exp_return["full_exp_dump"], open(asksaveasfilename(
filetypes = file_types,
title = title
), "w+"), indent=4)
command=lambda file_types=(("JSON", "*.json"), ("Text File", "*.txt"), ("All Files", "*.*")),
title="Save experiment results as json", results=exp_return["full_exp_dump"]:
self.export_exp_results(results, file_types, title, "json")
)
results_export_json.pack(side=RIGHT)

# results_export_text = Button(
# export_buttons_frame, text="Save as text",
# command = lambda file_types=(("Text File", "*.txt"), ("All Files", "*.*")),
# title="Save results as text":
# json_dump(exp_return["results_text"], open(asksaveasfilename(
# filetypes = file_types,
# title = title
# ), "w+"), indent=4)
# )
# results_export_text.pack(side=RIGHT)

results_export_pickle = Button(
export_buttons_frame, text="Save as serialized Python object",
command=lambda file_types=(("Pickle", "*.pkl"), ("All Files", "*.*")),
title="Save experiment results as serialized Python object", results=exp_return["full_exp_dump"]:
self.export_exp_results(results, file_types, title, "pkl")
)
results_export_pickle.pack(side=RIGHT)

self.results_window.geometry(self.dpi_setting["dpi_process_window_geometry_finished"])
if exp_return["message"].strip() == "":
@@ -357,12 +353,21 @@ def display_results(self):
self.change_style(self.results_window)
return

def export_exp_results(self, results, file_types, title):
def export_exp_results(self, results, file_types, title, file_type):
save_to = asksaveasfilename(
filetypes = file_types,
title = title
)

if len(save_to) <= 0: return
if file_type == "json":
save_to_file = open(save_to, "w+")
json_dump(results, save_to_file, indent=4)
elif file_type == "pkl":
save_to_file = open(save_to, "wb")
pickle_dump(results, save_to_file)
else:
raise ValueError("Unknown file type")
save_to_file.close()

def process_check(
self,
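For reference, results written through the new pickle branch can be reloaded with pickle.load, while the JSON branch stays human-readable. A minimal sketch, assuming a results file was already saved from the GUI (the file names below are illustrative placeholders, not names the GUI produces):

import json
import pickle

# Reload experiment results saved by the export buttons above.
# "results.pkl" / "results.json" are placeholder names for this sketch.
with open("results.pkl", "rb") as f:   # the pickle branch writes in binary mode ("wb")
    results_obj = pickle.load(f)       # restores the original Python object
with open("results.json", "r") as f:   # the JSON branch writes plain text ("w+")
    results_json = json.load(f)
print(type(results_obj), type(results_json))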
6 changes: 3 additions & 3 deletions generics/Canonicizer.py
@@ -124,7 +124,7 @@ def displayDescription():
class StripPunctuation(Canonicizer):
full_width = 1
_variable_options = {
"chn_jpa": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
"full_width": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
}
_punct = re.compile(",.?!\"'`;:-()&$")
_fw_punct = re.compile(",。?!“”‘’;:——()、《》【】『』")
@@ -175,7 +175,7 @@ def displayName():
class PunctuationSeparator(Canonicizer):
full_width = 1
_variable_options = {
"chn_jpa": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
"full_width": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
}
_punct = ",.?!\"'`;:-()&$"
_fw_punct = ",。?!“”‘’;:——()、《》【】『』"
@@ -194,7 +194,7 @@ def displayName():
class StripAlphanumeric(Canonicizer):
full_width = 1
_variable_options = {
"chn_jpa": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
"full_width": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
}
_punct = ",.?!\"'`;:-()&$"
_fw_punct = ",。?!“”‘’;:——()、《》【】『』"
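The three edits above rename the _variable_options key from "chn_jpa" to "full_width" so it matches the class attribute it is meant to control. A rough sketch of the presumed convention (this apply_options helper is hypothetical, not PyGAAP's actual GUI code):

# Hypothetical illustration: if chosen option values are applied by attribute name,
# the old key "chn_jpa" would set a stray attribute and never touch full_width.
def apply_options(module, chosen):
    for name, value in chosen.items():
        if name in module._variable_options:
            setattr(module, name, value)  # "full_width" now reaches the full_width attribute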
66 changes: 53 additions & 13 deletions generics/modules/nc_0.py
@@ -1,37 +1,77 @@
from generics.Embedding import Embedding
from backend.Histograms import generateAbsoluteHistogram as gh
from backend import PrepareNumbers as pn
# from backend.Histograms import generateAbsoluteHistogram as gh
# from backend import PrepareNumbers as pn
from multiprocessing import Pool, cpu_count
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

class Frequency(Embedding):
# class Frequency(Embedding):

# normalization = "linear scale [0, 1]"
# _default_multiprocessing = False
# _variable_options = {
# "normalization": {"options": ["none", "linear scale [0, 1]"], "type": "OptionMenu", "default": 1,
# "displayed_name": "Normalization"}
# }

# def convert(self, docs, pipe=None):
# """Convert and assign to Documents.numbers"""
# if self._default_multiprocessing:
# with Pool(cpu_count()-1) as p:
# raw_frequency = p.map(gh, docs)
# else:
# raw_frequency = [gh(d) for d in docs]
# numbers = pn.dicts_to_array(raw_frequency)
# if self.normalization == "none": pass
# elif self.normalization == "linear scale [0, 1]":
# numbers = numbers/np.max(numbers, axis=1, keepdims=1)
# for d_index in range(len(docs)):
# docs[d_index].numbers = numbers[d_index:d_index+1,:][0]
# return numbers

# def displayDescription():
# return ("Converts events to their frequencies.\n" +\
# "linear scale [0, 1] in normalization means scaling values to [0, 1].\n\n" +\
# "If a doc's features are all zeros, normalization may result in NaNs.")

# def displayName():
# return "Frequency"


class Frequency(Embedding):
normalization = "linear scale [0, 1]"
max_features = 0
binary = 0
_default_multiprocessing = False
_variable_options = {
"normalization": {"options": ["none", "linear scale [0, 1]"], "type": "OptionMenu", "default": 1,
"displayed_name": "Normalization"}
"displayed_name": "Normalization"},
"max_features": {"options": range(0, 101), "type": "Slider", "default": 0, "displayed_name": "Max features"},
"binary": {"options": [0, 1], "type": "Tick", "default": 0, "displayed_name": "Binary"}
}

def convert(self, docs, pipe=None):
"""Convert and assign to Documents.numbers"""
if self._default_multiprocessing:
with Pool(cpu_count()-1) as p:
raw_frequency = p.map(gh, docs)
else:
raw_frequency = [gh(d) for d in docs]
numbers = pn.dicts_to_array(raw_frequency)

mf = self.max_features if self.max_features > 0 else None
bi = True if self.binary else False
cv = CountVectorizer(lowercase=False, analyzer=lambda x:x, max_features=mf, binary=bi)
numbers = cv.fit_transform([d.eventSet for d in docs]).toarray()

if self.normalization == "none": pass
elif self.normalization == "linear scale [0, 1]":
numbers = numbers/np.max(numbers, axis=1, keepdims=1)
numbers = numbers / np.max(numbers, axis=1, keepdims=1)
for d_index in range(len(docs)):
docs[d_index].numbers = numbers[d_index:d_index+1,:][0]
return numbers

def displayDescription():
return ("Converts events to their frequencies.\n" +\
return (
"Converts events to their frequencies, using sklearn's count vectorizer\n" +\
"linear scale [0, 1] in normalization means scaling values to [0, 1].\n\n" +\
"If a doc's features are all zeros, normalization may result in NaNs.")
"Max features: only tally top n tokens by raw counts. If zero, tally all.\n"+\
"binary: use 0, 1 for token presence/absence instead of counting frequencies."
)

def displayName():
return "Frequency"
