20230405b Pickle dump, CountVectorizer, fix canonicizers

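Added an option to save experiment results as a pickle file; switched the "Frequency" embedding to sklearn's CountVectorizer; fixed the canonicizers' "Include full-width" option key ("chn_jpa" -> "full_width").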
Michaeljfang · Apr 5, 2023 · e733a59 · e733a59
1 parent 949624d
commit e733a59
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 35 deletions.
diff --git a/Constants.py b/Constants.py
@@ -1,3 +1,3 @@
 # PyGAAP Constants
 version = "1.0.0 alpha 2"
-versiondate = "2023.03.22"
+versiondate = "2023.04.05"
diff --git a/backend/GUI/GUI2.py b/backend/GUI/GUI2.py
@@ -28,6 +28,7 @@
 from sys import platform
 from json import load as json_load
 from json import dump as json_dump
+from pickle import dump as pickle_dump
 from os import listdir as ls
 from time import sleep
 from pathlib import Path
@@ -328,25 +329,20 @@ def display_results(self):
 
 		results_export_json = Button(
 			export_buttons_frame, text="Save as json",
-			command = lambda file_types=(("JSON", "*.json"), ("Text File", "*.txt"), ("All Files", "*.*")),
-			title="Save experiment results as json":
-			json_dump(exp_return["full_exp_dump"], open(asksaveasfilename(
-				filetypes = file_types,
-				title = title
-			), "w+"), indent=4)
+			command=lambda file_types=(("JSON", "*.json"), ("Text File", "*.txt"), ("All Files", "*.*")),
+			title="Save experiment results as json", results=exp_return["full_exp_dump"]:
+			self.export_exp_results(results, file_types, title, "json")
 		)
 		results_export_json.pack(side=RIGHT)
 
-		# results_export_text = Button(
-		# 	export_buttons_frame, text="Save as text",
-		# 	command = lambda file_types=(("Text File", "*.txt"), ("All Files", "*.*")),
-		# 	title="Save results as text":
-		# 	json_dump(exp_return["results_text"], open(asksaveasfilename(
-		# 		filetypes = file_types,
-		# 		title = title
-		# 	), "w+"), indent=4)
-		# )
-		# results_export_text.pack(side=RIGHT)
+
+		results_export_pickle = Button(
+			export_buttons_frame, text="Save as serialized Python object",
+			command=lambda file_types=(("Pickle", "*.pkl"), ("All Files", "*.*")),
+			title="Save experiment results as serialized Python object", results=exp_return["full_exp_dump"]:
+			self.export_exp_results(results, file_types, title, "pkl")
+		)
+		results_export_pickle.pack(side=RIGHT)
 
 		self.results_window.geometry(self.dpi_setting["dpi_process_window_geometry_finished"])
 		if exp_return["message"].strip() == "":
@@ -357,12 +353,21 @@ def display_results(self):
 		self.change_style(self.results_window)
 		return
 
-	def export_exp_results(self, results, file_types, title):
+	def export_exp_results(self, results, file_types, title, file_type):
 		save_to = asksaveasfilename(
 			filetypes = file_types,
 			title = title
 		)
-
+		if len(save_to) <= 0: return
+		if file_type == "json":
+			save_to_file = open(save_to, "w+")
+			json_dump(results, save_to_file, indent=4)
+		elif file_type == "pkl":
+			save_to_file = open(save_to, "wb")
+			pickle_dump(results, save_to_file)
+		else:
+			raise ValueError("Unknown file type")
+		save_to_file.close()
 
 	def process_check(
 			self,

diff --git a/generics/Canonicizer.py b/generics/Canonicizer.py
@@ -124,7 +124,7 @@ def displayDescription():
 class StripPunctuation(Canonicizer):
 	full_width = 1
 	_variable_options = {
-		"chn_jpa": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
+		"full_width": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
 	}
 	_punct = re.compile(",.?!\"'`;:-()&$")
 	_fw_punct = re.compile("，。？！“”‘’；：——（）、《》【】『』")
@@ -175,7 +175,7 @@ def displayName():
 class PunctuationSeparator(Canonicizer):
 	full_width = 1
 	_variable_options = {
-		"chn_jpa": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
+		"full_width": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
 	}
 	_punct = ",.?!\"'`;:-()&$"
 	_fw_punct = "，。？！“”‘’；：——（）、《》【】『』"
@@ -194,7 +194,7 @@ def displayName():
 class StripAlphanumeric(Canonicizer):
 	full_width = 1
 	_variable_options = {
-		"chn_jpa": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
+		"full_width": {"options": [0, 1], "type": "Tick", "default": 1, "displayed_name": "Include full-width"}
 	}
 	_punct = ",.?!\"'`;:-()&$"
 	_fw_punct = "，。？！“”‘’；：——（）、《》【】『』"

diff --git a/generics/modules/nc_0.py b/generics/modules/nc_0.py
@@ -1,37 +1,77 @@
 from generics.Embedding import Embedding
-from backend.Histograms import generateAbsoluteHistogram as gh
-from backend import PrepareNumbers as pn
+# from backend.Histograms import generateAbsoluteHistogram as gh
+# from backend import PrepareNumbers as pn
 from multiprocessing import Pool, cpu_count
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
 
-class Frequency(Embedding):
+# class Frequency(Embedding):
+
+# 	normalization = "linear scale [0, 1]"
+# 	_default_multiprocessing = False
+# 	_variable_options = {
+# 		"normalization": {"options": ["none", "linear scale [0, 1]"], "type": "OptionMenu", "default": 1,
+# 		"displayed_name": "Normalization"}
+# 	}
+
+# 	def convert(self, docs, pipe=None):
+# 		"""Convert and assign to Documents.numbers"""
+# 		if self._default_multiprocessing:
+# 			with Pool(cpu_count()-1) as p:
+# 				raw_frequency = p.map(gh, docs)
+# 		else:
+# 			raw_frequency = [gh(d) for d in docs]
+# 		numbers = pn.dicts_to_array(raw_frequency)
+# 		if self.normalization == "none": pass
+# 		elif self.normalization == "linear scale [0, 1]":
+# 			numbers = numbers/np.max(numbers, axis=1, keepdims=1)
+# 		for d_index in range(len(docs)):
+# 			docs[d_index].numbers = numbers[d_index:d_index+1,:][0]
+# 		return numbers
+
+# 	def displayDescription():
+# 		return ("Converts events to their frequencies.\n" +\
+# 			"linear scale [0, 1] in normalization means scaling values to [0, 1].\n\n" +\
+# 			"If a doc's features are all zeros, normalization may result in NaNs.")
+
+# 	def displayName():
+# 		return "Frequency"
 
+
+class Frequency(Embedding):
 	normalization = "linear scale [0, 1]"
+	max_features = 0
+	binary = 0
 	_default_multiprocessing = False
 	_variable_options = {
 		"normalization": {"options": ["none", "linear scale [0, 1]"], "type": "OptionMenu", "default": 1,
-		"displayed_name": "Normalization"}
+		"displayed_name": "Normalization"},
+		"max_features": {"options": range(0, 101), "type": "Slider", "default": 0, "displayed_name": "Max features"},
+		"binary": {"options": [0, 1], "type": "Tick", "default": 0, "displayed_name": "Binary"}
 	}
 
 	def convert(self, docs, pipe=None):
 		"""Convert and assign to Documents.numbers"""
-		if self._default_multiprocessing:
-			with Pool(cpu_count()-1) as p:
-				raw_frequency = p.map(gh, docs)
-		else:
-			raw_frequency = [gh(d) for d in docs]
-		numbers = pn.dicts_to_array(raw_frequency)
+
+		mf = self.max_features if self.max_features > 0 else None
+		bi = True if self.binary else False
+		cv = CountVectorizer(lowercase=False, analyzer=lambda x:x, max_features=mf, binary=bi)
+		numbers = cv.fit_transform([d.eventSet for d in docs]).toarray()
+
 		if self.normalization == "none": pass
 		elif self.normalization == "linear scale [0, 1]":
-			numbers = numbers/np.max(numbers, axis=1, keepdims=1)
+			numbers = numbers / np.max(numbers, axis=1, keepdims=1)
 		for d_index in range(len(docs)):
 			docs[d_index].numbers = numbers[d_index:d_index+1,:][0]
 		return numbers
 
 	def displayDescription():
-		return ("Converts events to their frequencies.\n" +\
+		return (
+			"Converts events to their frequencies, using sklearn's count vectorizer\n" +\
 			"linear scale [0, 1] in normalization means scaling values to [0, 1].\n\n" +\
-			"If a doc's features are all zeros, normalization may result in NaNs.")
+			"Max features: only tally top n tokens by raw counts. If zero, tally all.\n"+\
+			"binary: use 0, 1 for token presence/absence instead of counting frequencies."
+		)
 
 	def displayName():
 		return "Frequency"