Skip to content

Commit

Permalink
modified summarizer (#704)
Browse files Browse the repository at this point in the history
feat: enable setting of typeset/summarizer
refactor: summarizer extends handler
  • Loading branch information
ieaves authored and sbrugman committed Mar 1, 2021
1 parent aca523e commit 18b05e4
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 90 deletions.
149 changes: 80 additions & 69 deletions src/pandas_profiling/model/handler.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,80 @@
from functools import reduce
from typing import Type

import networkx as nx
from visions import VisionsBaseType

from pandas_profiling.model import typeset as ppt


def compose(functions):
    """
    Compose a sequence of functions into one left-to-right pipeline

    Each function is called with the positional arguments produced by the
    previous one and must return an iterable holding the arguments for the
    next one. A function returning a plain ``bool`` is treated as a check
    only: its result is discarded and its own inputs are forwarded unchanged.

    :param functions: iterable of functions, applied in the given order
    :return: combined functions, e.g. [f(x), g(x)] -> g(f(x)); calling the
        combined function returns the final arguments as a tuple
    """
    # Snapshot the functions once, at composition time. Accepting any iterable
    # (not only reversible sequences) is a backward-compatible extension.
    steps = tuple(functions)

    def pipeline(*x):
        # Iterating instead of nesting one closure per function keeps the call
        # depth constant, so long pipelines cannot hit the recursion limit.
        for step in steps:
            res = step(*x)
            if not isinstance(res, bool):
                # Unpack eagerly so a non-iterable result fails right here
                # with a TypeError.
                x = tuple(res)
        return x

    return pipeline


class Handler:
    """Dispatch from a data type to the chain of functions registered for it"""

    def __init__(self, mapping, typeset, *args, **kwargs):
        """
        :param mapping: dict from a type to its list of functions; it is stored
            by reference and extended in place by ``_complete_dag``
        :param typeset: typeset whose ``base_graph`` relates the types
        :param args: accepted but not used here
        :param kwargs: accepted but not used here
        """
        self.mapping = mapping
        self.typeset = typeset

        self._complete_dag()

    def _complete_dag(self):
        """
        Prepend to every type the functions of the types it is reached from

        The nodes of the line graph are the edges of ``base_graph``; visiting
        them in topological order means the functions of ``from_type`` are
        already complete when they are copied in front of those of ``to_type``.
        """
        for from_type, to_type in nx.topological_sort(
            nx.line_graph(self.typeset.base_graph)
        ):
            # A type missing from the mapping counts as "no functions", which
            # is the same tolerance ``handle`` applies on lookup.
            self.mapping[to_type] = self.mapping.get(from_type, []) + self.mapping.get(
                to_type, []
            )

    def handle(self, dtype: Type[VisionsBaseType], *args, **kwargs) -> tuple:
        """
        Run the functions registered for ``dtype`` on the given arguments

        Args:
            dtype: the type whose function chain should be applied
            *args: positional arguments handed to the first function
            **kwargs: accepted but not forwarded to the functions

        Returns:
            tuple: the arguments as returned by the last function in the chain
                (``args`` unchanged when nothing is registered for ``dtype``)
        """
        op = compose(self.mapping.get(dtype, []))
        return op(*args)


def get_render_map():
    """
    Build the lookup from each profiling type to its render function

    :return: dict keyed by the ``ppt`` type classes, each mapped to the
        matching ``render_*`` function
    """
    # Imported at call time rather than at module import time
    # (presumably to avoid a circular import - confirm).
    import pandas_profiling.report.structure.variables as render_algorithms

    return {
        ppt.Boolean: render_algorithms.render_boolean,
        ppt.Numeric: render_algorithms.render_real,
        ppt.Complex: render_algorithms.render_complex,
        ppt.DateTime: render_algorithms.render_date,
        ppt.Categorical: render_algorithms.render_categorical,
        ppt.URL: render_algorithms.render_url,
        ppt.Path: render_algorithms.render_path,
        ppt.File: render_algorithms.render_file,
        ppt.Image: render_algorithms.render_image,
        ppt.Unsupported: render_algorithms.render_generic,
    }
from functools import reduce
from typing import Callable, Dict, List, Type

import networkx as nx
from visions import VisionsBaseType, VisionsTypeset

from pandas_profiling.model import typeset as ppt


def compose(functions):
    """
    Compose a sequence of functions into one left-to-right pipeline

    Each function is called with the positional arguments produced by the
    previous one and must return an iterable holding the arguments for the
    next one. A function returning a plain ``bool`` is treated as a check
    only: its result is discarded and its own inputs are forwarded unchanged.

    :param functions: iterable of functions, applied in the given order
    :return: combined functions, e.g. [f(x), g(x)] -> g(f(x)); calling the
        combined function returns the final arguments as a tuple
    """
    # Snapshot the functions once, at composition time. Accepting any iterable
    # (not only reversible sequences) is a backward-compatible extension.
    steps = tuple(functions)

    def pipeline(*x):
        # Iterating instead of nesting one closure per function keeps the call
        # depth constant, so long pipelines cannot hit the recursion limit.
        for step in steps:
            res = step(*x)
            if not isinstance(res, bool):
                # Unpack eagerly so a non-iterable result fails right here
                # with a TypeError.
                x = tuple(res)
        return x

    return pipeline


class Handler:
    """A generic handler

    Allows any custom mapping between data types and functions
    """

    def __init__(
        self,
        mapping: Dict[Type[VisionsBaseType], List[Callable]],
        typeset: VisionsTypeset,
        *args,
        **kwargs
    ):
        """
        Args:
            mapping: dict from a type to its list of functions; it is stored
                by reference and extended in place by ``_complete_dag``
            typeset: typeset whose ``base_graph`` relates the types
            *args: accepted but not used here
            **kwargs: accepted but not used here
        """
        self.mapping = mapping
        self.typeset = typeset

        self._complete_dag()

    def _complete_dag(self) -> None:
        """
        Prepend to every type the functions of the types it is reached from

        The nodes of the line graph are the edges of ``base_graph``; visiting
        them in topological order means the functions of ``from_type`` are
        already complete when they are copied in front of those of ``to_type``.
        """
        for from_type, to_type in nx.topological_sort(
            nx.line_graph(self.typeset.base_graph)
        ):
            # A type missing from the mapping counts as "no functions", which
            # is the same tolerance ``handle`` applies on lookup.
            self.mapping[to_type] = self.mapping.get(from_type, []) + self.mapping.get(
                to_type, []
            )

    def handle(self, dtype: Type[VisionsBaseType], *args, **kwargs) -> tuple:
        """
        Run the functions registered for ``dtype`` on the given arguments

        Args:
            dtype: the type whose function chain should be applied
            *args: positional arguments handed to the first function
            **kwargs: accepted but not forwarded to the functions

        Returns:
            tuple: the arguments as returned by the last function in the chain
                (``args`` unchanged when nothing is registered for ``dtype``)
        """
        op = compose(self.mapping.get(dtype, []))
        return op(*args)


def get_render_map():
    """
    Build the lookup from each profiling type to its render function

    :return: dict keyed by the ``ppt`` type classes, each mapped to the
        matching ``render_*`` function
    """
    # Imported at call time rather than at module import time
    # (presumably to avoid a circular import - confirm).
    import pandas_profiling.report.structure.variables as render_algorithms

    return {
        ppt.Boolean: render_algorithms.render_boolean,
        ppt.Numeric: render_algorithms.render_real,
        ppt.Complex: render_algorithms.render_complex,
        ppt.DateTime: render_algorithms.render_date,
        ppt.Categorical: render_algorithms.render_categorical,
        ppt.URL: render_algorithms.render_url,
        ppt.Path: render_algorithms.render_path,
        ppt.File: render_algorithms.render_file,
        ppt.Image: render_algorithms.render_image,
        ppt.Unsupported: render_algorithms.render_generic,
    }
25 changes: 8 additions & 17 deletions src/pandas_profiling/model/summarizer.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from typing import Type

import networkx as nx
import numpy as np
import pandas as pd
from visions import VisionsBaseType

from pandas_profiling.model.handler import compose
from pandas_profiling.model.handler import Handler
from pandas_profiling.model.summary_algorithms import (
describe_categorical_1d,
describe_counts,
Expand All @@ -31,33 +30,25 @@
)


class BaseSummarizer:
def __init__(self, summary_map, typeset, *args, **kwargs):
self.summary_map = summary_map
self.typeset = typeset
class BaseSummarizer(Handler):
"""A base summarizer
self._complete_summaries()

def _complete_summaries(self):
for from_type, to_type in nx.topological_sort(
nx.line_graph(self.typeset.base_graph)
):
self.summary_map[to_type] = (
self.summary_map[from_type] + self.summary_map[to_type]
)
Can be used to define custom summarizations
"""

def summarize(self, series: pd.Series, dtype: Type[VisionsBaseType]) -> dict:
"""
Returns:
object:
"""
summarizer_func = compose(self.summary_map.get(dtype, []))
_, summary = summarizer_func(series, {"type": dtype})
_, summary = self.handle(dtype, series, {"type": dtype})
return summary


class PandasProfilingSummarizer(BaseSummarizer):
"""The default Pandas Profiling summarizer"""

def __init__(self, typeset, *args, **kwargs):
summary_map = {
Unsupported: [
Expand Down
17 changes: 13 additions & 4 deletions src/pandas_profiling/profile_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from visions import VisionsTypeset

from pandas_profiling.config import config
from pandas_profiling.expectations_report import ExpectationsReport
from pandas_profiling.model.describe import describe as describe_df
from pandas_profiling.model.messages import MessageType
from pandas_profiling.model.sample import Sample
from pandas_profiling.model.summarizer import PandasProfilingSummarizer, format_summary
from pandas_profiling.model.summarizer import (
BaseSummarizer,
PandasProfilingSummarizer,
format_summary,
)
from pandas_profiling.model.typeset import ProfilingTypeSet
from pandas_profiling.report import get_report_structure
from pandas_profiling.report.presentation.flavours.html.templates import (
Expand All @@ -27,7 +32,7 @@
class ProfileReport(SerializeReport, ExpectationsReport):
"""Generate a profile report from a Dataset stored as a pandas `DataFrame`.
Used has is it will output its content as an HTML report in a Jupyter notebook.
Used as is, it will output its content as an HTML report in a Jupyter notebook.
"""

def __init__(
Expand All @@ -41,6 +46,8 @@ def __init__(
sample: Optional[dict] = None,
config_file: Union[Path, str] = None,
lazy: bool = True,
typeset: Optional[VisionsTypeset] = None,
summarizer: Optional[BaseSummarizer] = None,
**kwargs,
):
"""Generate a ProfileReport based on a pandas DataFrame
Expand All @@ -51,6 +58,8 @@ def __init__(
config_file: a config file (.yml), mutually exclusive with `minimal`
lazy: compute when needed
sample: optional dict(name="Sample title", caption="Caption", data=pd.DataFrame())
typeset: optional user typeset to use for type inference
summarizer: optional user summarizer to generate custom summary output
**kwargs: other arguments, for valid arguments, check the default configuration file.
"""
config.clear() # to reset (previous) config.
Expand Down Expand Up @@ -92,8 +101,8 @@ def __init__(
self._html = None
self._widgets = None
self._json = None
self._typeset = None
self._summarizer = None
self._typeset = typeset
self._summarizer = summarizer

if df is not None:
# preprocess df
Expand Down

0 comments on commit 18b05e4

Please sign in to comment.