Skip to content

Commit

Permalink
logo update and refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
meetyildiz committed Sep 8, 2020
1 parent 7860c2e commit 0e33dcd
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 62 deletions.
8 changes: 0 additions & 8 deletions __init__.py

This file was deleted.

Binary file modified logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file added pandazip/__init__.py
Empty file.
69 changes: 23 additions & 46 deletions pandazip.py → pandazip/pandazip.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,20 @@
import numpy as np
import pandas as pd
import time
import gc
from joblib import Parallel, delayed


def measure_time_mem(func):
    """Wrap a ``(self, data, ...)`` method to measure its memory effect and run time.

    The wrapper reads ``data.memory_usage().sum()`` before the call and
    ``ret.memory_usage().sum()`` after it, divides both by
    ``self.memory_scale_factor``, and reports the two figures together with
    the elapsed time at DEBUG level on this module's logger. Nothing is
    written to stdout, and nothing is emitted at all unless the application
    enables debug logging. A garbage collection pass runs before the result
    is handed back.

    :param func: callable invoked as ``func(self, data, *args, **kwargs)``;
        ``data`` and the return value must both provide
        ``memory_usage().sum()`` (a pandas DataFrame does)
    :return: the wrapping function, carrying ``func``'s name and docstring
    """
    # Imported locally so the decorator is self-contained.
    import functools
    import gc
    import logging
    import time

    logger = logging.getLogger(__name__)

    @functools.wraps(func)
    def wrapped_reduce(self, data, *args, **kwargs):
        # pre
        mem_usage_orig = data.memory_usage().sum() / self.memory_scale_factor
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # call cannot produce a negative or inflated duration.
        start_time = time.perf_counter()
        # exec
        ret = func(self, data, *args, **kwargs)
        # post
        mem_usage_new = ret.memory_usage().sum() / self.memory_scale_factor
        elapsed = time.perf_counter() - start_time
        logger.debug(
            'reduced data from %.4f MB to %.4f MB in %.2f seconds',
            mem_usage_orig, mem_usage_new, elapsed,
        )
        # Reclaim whatever the wrapped call left unreferenced.
        gc.collect()
        return ret

    return wrapped_reduce
from pandazip.utils import measure_time_mem


class Pandazip:
"""
Class that takes a dict of increasingly big numpy datatypes to transform
the data of a pandas dataframe into, in order to save memory usage.
Class that takes a pandas DataFrame and compresses each column to the
smallest feasible datatype when level="low". When level is "mid" or "high",
numeric data types are forced down to at most 32 and 16 bits respectively.
"""
memory_scale_factor = 1024 ** 2 # memory in MB

def __init__(self, encode_cat=False, n_jobs=-1):
"""
:param conv_table: dict with np.dtypes-strings as keys
:param encode_cat: Whether the new pandas dtype "Categoricals"
shall be used
:param n_jobs: Parallelization rate
"""
conv_table = None
def __init__(self):

self.n_jobs = n_jobs
self.n_jobs = -1

def _type_candidates(self, k):
for c in self.compress_lookup[k]:
Expand All @@ -55,32 +27,36 @@ def zip(self, data, level="low", verbose=False):
smallest necessary types.
:param data: pandas dataframe
:param level: string - "low", "mid", "high"
:param verbose: If True, outputs more information
:return: pandas dataframe with reduced data types
"""
if level == "low":
self.compress_lookup = {'int': [np.int8, np.int16, np.int32, np.int64],
'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
'float': [np.float16, np.float32, np.float64, ]}
self.encode_cat = False
self.pandas_category = False

elif level == "mid":
self.compress_lookup = {'int': [np.int8, np.int16, np.int32],
'uint': [np.uint8, np.uint16, np.uint32],
'float': [np.float16, np.float32]}
self.encode_cat = False
self.pandas_category = True

elif level == "high":
self.compress_lookup = {'int': [np.int8, np.int16],
'uint': [np.uint8, np.uint16],
'float': [np.float16]}
self.encode_cat = True
self.pandas_category = True

else:
print("bad")
print("Bad level")

start_size = round(data.memory_usage().sum() / 1024 ** 2, 2)
print("Starting size is :{} MB".format(start_size))
print("Starting size :{} MB".format(start_size))

for col in data.columns:
data[col] = pd.to_numeric(data[col], errors='ignore')

ret_list = Parallel(n_jobs=self.n_jobs)(delayed(self._reduce)
(data[c], c, verbose) for c in
Expand All @@ -90,10 +66,12 @@ def zip(self, data, level="low", verbose=False):
gc.collect()

reduced_data = pd.concat(ret_list, axis=1)
boolian_cols = reduced_data.select_dtypes("bool").columns
reduced_data[boolian_cols] = reduced_data[boolian_cols].astype(np.uint8)

final_size = round(reduced_data.memory_usage().sum() / 1024 ** 2, 2)
print("Finishing size is :{} MB".format(final_size))
print("Compression rate is {}%".format(round(1 - final_size / start_size, 2)))
print("Finishing size: {} MB".format(final_size))
print("Compression rate: {}%".format(round((1 - final_size / start_size) * 100, 2)))
return reduced_data

def _reduce(self, s, colname, verbose):
Expand All @@ -108,7 +86,7 @@ def _reduce(self, s, colname, verbose):
elif np.issubdtype(coltype, np.floating):
conv_key = 'float'
else:
if isinstance(coltype, object) and self.encode_cat:
if isinstance(coltype, object) and self.pandas_category:
# check for all-strings series
if s.apply(lambda x: isinstance(x, str)).all():
if verbose: print(f'convert {colname} to categorical')
Expand All @@ -121,7 +99,6 @@ def _reduce(self, s, colname, verbose):
if verbose: print(f'convert {colname} to {cand}')
return s.astype(cand)

# reaching this code is bad. Probably there are inf, or other very large numbers
print(f"WARNING: {colname} doesn't fit the grid with \nmax: {s.max()} "
f"and \nmin: {s.min()}")
print('Dropping it..')
return s.astype(cand)


22 changes: 22 additions & 0 deletions pandazip/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

def measure_time_mem(func):
    """Wrap a ``(self, data, ...)`` method to measure its memory effect and run time.

    The wrapper reads ``data.memory_usage().sum()`` before the call and
    ``ret.memory_usage().sum()`` after it, converts both from bytes to MB
    (dividing by ``1024 ** 2``), and reports the two figures together with
    the elapsed time at DEBUG level on this module's logger. Nothing is
    written to stdout, and nothing is emitted at all unless the application
    enables debug logging. A garbage collection pass runs before the result
    is handed back.

    :param func: callable invoked as ``func(self, data, *args, **kwargs)``;
        ``data`` and the return value must both provide
        ``memory_usage().sum()`` (a pandas DataFrame does)
    :return: the wrapping function, carrying ``func``'s name and docstring
    """
    # This module has no top-level imports, so everything the wrapper needs
    # is imported here to keep the decorator self-contained.
    import functools
    import gc
    import logging
    import time

    logger = logging.getLogger(__name__)
    bytes_per_mb = 1024 ** 2

    @functools.wraps(func)
    def wrapped_reduce(self, data, *args, **kwargs):
        # pre
        mem_usage_orig = data.memory_usage().sum() / bytes_per_mb
        # perf_counter is monotonic, so a wall-clock adjustment during the
        # call cannot produce a negative or inflated duration.
        start_time = time.perf_counter()
        # exec
        ret = func(self, data, *args, **kwargs)
        # post
        mem_usage_new = ret.memory_usage().sum() / bytes_per_mb
        elapsed = time.perf_counter() - start_time
        logger.debug(
            'reduced data from %.4f MB to %.4f MB in %.2f seconds',
            mem_usage_orig, mem_usage_new, elapsed,
        )
        # Reclaim whatever the wrapped call left unreferenced.
        gc.collect()
        return ret

    return wrapped_reduce
Empty file added requirements.txt
Empty file.
13 changes: 5 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,13 @@
packages=[],
install_requires=['pandas',
'numpy',
'joblib',
],

classifiers=[
'Development Status :: 1 - Planning',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: BSD License',
'Operating System :: POSIX :: Linux',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
"Programming Language :: Python :: 3",
"License :: OSI Approved :: BSD 3-clause",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
)

0 comments on commit 0e33dcd

Please sign in to comment.