forked from DavidBerdik/PyGAAP
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
20230405b Pickle dump, CountVectorizer, fix canonicizers
Added an option to save experiment results as a pickle file; switched the "Frequency" embedding to scikit-learn's CountVectorizer.
- Loading branch information
1 parent 949624d, commit e733a59
Showing 4 changed files with 80 additions and 35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# PyGAAP Constants | ||
version = "1.0.0 alpha 2" | ||
versiondate = "2023.03.22" | ||
versiondate = "2023.04.05" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,77 @@ | ||
from generics.Embedding import Embedding | ||
from backend.Histograms import generateAbsoluteHistogram as gh | ||
from backend import PrepareNumbers as pn | ||
# from backend.Histograms import generateAbsoluteHistogram as gh | ||
# from backend import PrepareNumbers as pn | ||
from multiprocessing import Pool, cpu_count | ||
import numpy as np | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
|
||
class Frequency(Embedding): | ||
# class Frequency(Embedding): | ||
|
||
# normalization = "linear scale [0, 1]" | ||
# _default_multiprocessing = False | ||
# _variable_options = { | ||
# "normalization": {"options": ["none", "linear scale [0, 1]"], "type": "OptionMenu", "default": 1, | ||
# "displayed_name": "Normalization"} | ||
# } | ||
|
||
# def convert(self, docs, pipe=None): | ||
# """Convert and assign to Documents.numbers""" | ||
# if self._default_multiprocessing: | ||
# with Pool(cpu_count()-1) as p: | ||
# raw_frequency = p.map(gh, docs) | ||
# else: | ||
# raw_frequency = [gh(d) for d in docs] | ||
# numbers = pn.dicts_to_array(raw_frequency) | ||
# if self.normalization == "none": pass | ||
# elif self.normalization == "linear scale [0, 1]": | ||
# numbers = numbers/np.max(numbers, axis=1, keepdims=1) | ||
# for d_index in range(len(docs)): | ||
# docs[d_index].numbers = numbers[d_index:d_index+1,:][0] | ||
# return numbers | ||
|
||
# def displayDescription(): | ||
# return ("Converts events to their frequencies.\n" +\ | ||
# "linear scale [0, 1] in normalization means scaling values to [0, 1].\n\n" +\ | ||
# "If a doc's features are all zeros, normalization may result in NaNs.") | ||
|
||
# def displayName(): | ||
# return "Frequency" | ||
|
||
|
||
class Frequency(Embedding): | ||
normalization = "linear scale [0, 1]" | ||
max_features = 0 | ||
binary = 0 | ||
_default_multiprocessing = False | ||
_variable_options = { | ||
"normalization": {"options": ["none", "linear scale [0, 1]"], "type": "OptionMenu", "default": 1, | ||
"displayed_name": "Normalization"} | ||
"displayed_name": "Normalization"}, | ||
"max_features": {"options": range(0, 101), "type": "Slider", "default": 0, "displayed_name": "Max features"}, | ||
"binary": {"options": [0, 1], "type": "Tick", "default": 0, "displayed_name": "Binary"} | ||
} | ||
|
||
def convert(self, docs, pipe=None): | ||
"""Convert and assign to Documents.numbers""" | ||
if self._default_multiprocessing: | ||
with Pool(cpu_count()-1) as p: | ||
raw_frequency = p.map(gh, docs) | ||
else: | ||
raw_frequency = [gh(d) for d in docs] | ||
numbers = pn.dicts_to_array(raw_frequency) | ||
|
||
mf = self.max_features if self.max_features > 0 else None | ||
bi = True if self.binary else False | ||
cv = CountVectorizer(lowercase=False, analyzer=lambda x:x, max_features=mf, binary=bi) | ||
numbers = cv.fit_transform([d.eventSet for d in docs]).toarray() | ||
|
||
if self.normalization == "none": pass | ||
elif self.normalization == "linear scale [0, 1]": | ||
numbers = numbers/np.max(numbers, axis=1, keepdims=1) | ||
numbers = numbers / np.max(numbers, axis=1, keepdims=1) | ||
for d_index in range(len(docs)): | ||
docs[d_index].numbers = numbers[d_index:d_index+1,:][0] | ||
return numbers | ||
|
||
def displayDescription(): | ||
return ("Converts events to their frequencies.\n" +\ | ||
return ( | ||
"Converts events to their frequencies, using sklearn's count vectorizer\n" +\ | ||
"linear scale [0, 1] in normalization means scaling values to [0, 1].\n\n" +\ | ||
"If a doc's features are all zeros, normalization may result in NaNs.") | ||
"Max features: only tally top n tokens by raw counts. If zero, tally all.\n"+\ | ||
"binary: use 0, 1 for token presence/absence instead of counting frequencies." | ||
) | ||
|
||
def displayName(): | ||
return "Frequency" |