Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pandas_profiling still gives index error even when I reduce the dataframe size #560

Open
bi2017dg opened this issue Aug 28, 2020 · 5 comments
Open
Labels
information requested ❔ Cannot reproduce, waiting for minimum reproduction details.

Comments

@bi2017dg
Copy link

IndexError Traceback (most recent call last)
in
19 #ProfileReport(df_s[:10000])
20 profile = df[:1000].profile_report(title='LATE FEE & SUSPENSION Profiling Report', html={'style':{'full_width':True}})
---> 21 profile.to_file("output.html")
22 #(title='LATE FEE & SUSPENSION Profiling Report', html={'style':{'full_width':True}})
23 #profile.to_file(output_file="data profile.html")

~\Anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
243 silent: if False, opens the file in the default browser or download it in a Google Colab environment
244 """
--> 245 if not isinstance(output_file, Path):
246 output_file = Path(str(output_file))
247

~\Anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
346 with tqdm(total=1, desc="Render JSON", disable=disable_progress_bar) as pbar:
347 data = json.dumps(description, indent=4, cls=CustomEncoder)
--> 348 pbar.update()
349 return data
350

~\Anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
166 if self._df_hash == -1 and self.df is not None:
167 self._df_hash = hash_dataframe(self.df)
--> 168 return self._df_hash
169
170 @property

~\Anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
273 if not silent:
274 try:
--> 275 from google.colab import files
276
277 files.download(output_file.absolute().as_uri())

~\Anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
160 self._title = config["title"].get(str)
161
--> 162 return self._title
163
164 @property

~\Anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
141 self._report = None
142 self._html = None
--> 143 self._widgets = None
144 self._json = None
145

~\Anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df)
61 number_of_tasks = 9 + len(df.columns) + len(correlation_names)
62
---> 63 with tqdm(
64 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
65 ) as pbar:

~\Anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, pbar)
471 def get_series_description(series):
472 return describe_1d(series)
--> 473
474
475 def get_series_descriptions(df, pbar):

~\Anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
746 if success:
747 return value
--> 748 raise value
749
750 __next__ = next # XXX

~\Anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:

~\Anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
448 Variable.TYPE_URL: describe_url_1d,
449 Variable.TYPE_PATH: describe_path_1d,
--> 450 Variable.TYPE_IMAGE: describe_image_1d,
451 Variable.TYPE_FILE: describe_file_1d,
452 }

~\Anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series)
417 series: The Series to describe.
418 series_description: The dict containing the series description so far.
--> 419
420 Returns:
421 A dict containing calculated series description values.

~\Anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_date_1d(series, series_description)
231
232 stats["monotonic_increase"] = series.is_monotonic_increasing
--> 233 stats["monotonic_decrease"] = series.is_monotonic_decreasing
234
235 stats["monotonic_increase_strict"] = (

<array_function internals> in histogram(*args, **kwargs)

~\Anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
857 # The index computation is not guaranteed to give exactly
858 # consistent results within ~1 ULP of the bin edges.
--> 859 decrement = tmp_a < bin_edges[indices]
860 indices[decrement] -= 1
861 # The last bin includes the right edge. The other bins do not.

IndexError: index -9223372036854775808 is out of bounds for axis 0 with size 2

@sbrugman
Copy link
Collaborator

sbrugman commented Sep 2, 2020

Could you provide the minimal information needed to reproduce this error? This guide can help you craft a minimal bug report.

  • the minimal code you are using to generate the report

  • which environment you are using:

    • operating system (e.g. Windows, Linux, Mac)
    • Python version (e.g. 3.7)
    • jupyter notebook, console or IDE such as PyCharm
    • Package manager (e.g. pip, conda; for conda, include the output of `conda info`)
    • packages (`pip freeze > packages.txt` or `conda list`)
  • a sample or description of the dataset (df.head(), df.info())

@sbrugman sbrugman added the information requested ❔ Cannot reproduce, waiting for minimum reproduction details. label Sep 2, 2020
@mike11339
Copy link

I got the same issue as shown below. How did you solve it?


IndexError Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\IPython\core\formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in _repr_html_(self)
395 def _repr_html_(self):
396 """The ipython notebook widgets user interface gets called by the jupyter notebook."""
--> 397 self.to_notebook_iframe()
398
399 def __repr__(self):

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in to_notebook_iframe(self)
375 with warnings.catch_warnings():
376 warnings.simplefilter("ignore")
--> 377 display(get_notebook_iframe(self))
378
379 def to_widgets(self):

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\report\presentation\flavours\widget\notebook.py in get_notebook_iframe(profile)
63 output = get_notebook_iframe_src(profile)
64 elif attribute == "srcdoc":
---> 65 output = get_notebook_iframe_srcdoc(profile)
66 else:
67 raise ValueError(

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\report\presentation\flavours\widget\notebook.py in get_notebook_iframe_srcdoc(profile)
21 width = config["notebook"]["iframe"]["width"].get(str)
22 height = config["notebook"]["iframe"]["height"].get(str)
---> 23 src = html.escape(profile.to_html())
24
25 iframe = f'<iframe width="{width}" height="{height}" srcdoc="{src}" frameborder="0" allowfullscreen></iframe>'

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
346
347 """
--> 348 return self.html
349
350 def to_json(self) -> str:

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in html(self)
166 def html(self):
167 if self._html is None:
--> 168 self._html = self._render_html()
169 return self._html
170

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
273 from pandas_profiling.report.presentation.flavours import HTMLReport
274
--> 275 report = self.report
276
277 disable_progress_bar = not config["progress_bar"].get(bool)

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in report(self)
160 def report(self):
161 if self._report is None:
--> 162 self._report = get_report_structure(self.description_set)
163 return self._report
164

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
141 def description_set(self):
142 if self._description_set is None:
--> 143 self._description_set = describe_df(self.title, self.df)
144 return self._description_set
145

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df)
61 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
62 ) as pbar:
---> 63 series_description = get_series_descriptions(df, pbar)
64
65 pbar.set_postfix_str("Get variable types")

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, pbar)
470 # TODO: use Pool for Linux-based systems
471 with multiprocessing.pool.ThreadPool(pool_size) as executor:
--> 472 for i, (column, description) in enumerate(
473 executor.imap_unordered(multiprocess_1d, args)
474 ):

C:\ProgramData\Anaconda3\envs\data_analysis\lib\multiprocessing\pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX

C:\ProgramData\Anaconda3\envs\data_analysis\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
448 """
449 column, series = args
--> 450 return column, describe_1d(series)
451
452 # Multiprocessing of Describe 1D for each column

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series)
417 if series_description["type"] in type_to_func:
418 series_description.update(
--> 419 type_to_func[series_description["type"]](series, series_description)
420 )
421 else:

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\pandas_profiling\model\summary.py in describe_date_1d(series, series_description)
230 )
231 if chi_squared_threshold > 0.0:
--> 232 histogram = np.histogram(
233 series[series.notna()].astype("int64").values, bins="auto"
234 )[0]

<array_function internals> in histogram(*args, **kwargs)

C:\ProgramData\Anaconda3\envs\data_analysis\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
854 # The index computation is not guaranteed to give exactly
855 # consistent results within ~1 ULP of the bin edges.
--> 856 decrement = tmp_a < bin_edges[indices]
857 indices[decrement] -= 1
858 # The last bin includes the right edge. The other bins do not.

IndexError: index -9223372036854775808 is out of bounds for axis 0 with size 2

@bi2017dg
Copy link
Author

bi2017dg commented Sep 25, 2020 via email

@snehalvartak
Copy link

@bi2017dg how was this resolved? I am facing the same issue.

@zhoujianch
Copy link

zhoujianch commented Apr 18, 2023

I got the same issue as shown below. How did you solve it?
pandas=1.5.0
ydata-profiling=4.1.1

In [2]: a
Out[2]:
{'COUNT_READ_ONLY': [5.0],
'EVENT_NAME': ['transaction'],
'AVG_TIMER_READ_ONLY': [438577000.0],
'SUM_TIMER_READ_ONLY': [2192887000.0],
'SUM_TIMER_READ_WRITE': [1.0950042604231e+16],
'MIN_TIMER_READ_ONLY': [104499000.0],
'MIN_TIMER_READ_WRITE': [8308000.0],
'SUM_TIMER_WAIT': [1.0950044797118e+16],
'AVG_TIMER_WAIT': [1860772000.0],
'COUNT_READ_WRITE': [5884671.0],
'MAX_TIMER_READ_ONLY': [836559000.0],
'MAX_TIMER_WAIT': [603689471745000.0],
'MAX_TIMER_READ_WRITE': [603689471745000.0],
'AVG_TIMER_READ_WRITE': [1860773000.0],
'COUNT_STAR': [5884676.0],
'MIN_TIMER_WAIT': [8308000.0]}

In [3]: import pandas as pd

In [4]: import numpy as np

In [5]: from ydata_profiling import ProfileReport

In [6]: table = pd.DataFrame.from_dict(a)

In [7]: profile_report = ProfileReport(
...: table,
...: progress_bar=False,
...: infer_dtypes=False,
...: missing_diagrams=None,
...: correlations=None,
...: interactions=None,
...: # duplicates=None,
...: samples=None)
In [8]: profile_report.get_description()
/root/anaconda3/envs/py3.7/lib/python3.7/site-packages/numpy/core/_methods.py:234: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/root/anaconda3/envs/py3.7/lib/python3.7/site-packages/numpy/lib/histograms.py:822: RuntimeWarning: divide by zero encountered in double_scalars
norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
/root/anaconda3/envs/py3.7/lib/python3.7/site-packages/numpy/lib/histograms.py:850: RuntimeWarning: invalid value encountered in multiply
f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
/root/anaconda3/envs/py3.7/lib/python3.7/site-packages/numpy/core/fromnumeric.py:3622: RuntimeWarning: Degrees of freedom <= 0 for slice
**kwargs)
/root/anaconda3/envs/py3.7/lib/python3.7/site-packages/numpy/core/_methods.py:226: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)

IndexError Traceback (most recent call last)
in
----> 1 profile_report.get_description()

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/typeguard/__init__.py in wrapper(*args, **kwargs)
1031 memo = _CallMemo(python_func, _localns, args=args, kwargs=kwargs)
1032 check_argument_types(memo)
-> 1033 retval = func(*args, **kwargs)
1034 try:
1035 check_return_type(retval, memo)

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/profile_report.py in get_description(self)
315 Dict containing a description for each variable in the DataFrame.
316 """
--> 317 return self.description_set
318
319 def get_rejected_variables(self) -> set:

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/typeguard/__init__.py in wrapper(*args, **kwargs)
1031 memo = _CallMemo(python_func, _localns, args=args, kwargs=kwargs)
1032 check_argument_types(memo)
-> 1033 retval = func(*args, **kwargs)
1034 try:
1035 check_return_type(retval, memo)

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/profile_report.py in description_set(self)
251 self.summarizer,
252 self.typeset,
--> 253 self._sample,
254 )
255 return self._description_set

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/describe.py in describe(config, df, summarizer, typeset, sample)
70 pbar.total += len(df.columns)
71 series_description = get_series_descriptions(
---> 72 config, df, summarizer, typeset, pbar
73 )
74

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
313 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
314 try:
--> 315 return func(*args, **kwargs)
316 except TypeError as ex:
317 raise DispatchError(f"Function {func.__code__}") from ex

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/pandas/summary_pandas.py in pandas_get_series_descriptions(config, df, summarizer, typeset, pbar)
98 with multiprocessing.pool.ThreadPool(pool_size) as executor:
99 for i, (column, description) in enumerate(
--> 100 executor.imap_unordered(multiprocess_1d, args)
101 ):
102 pbar.set_postfix_str(f"Describe variable:{column}")

~/anaconda3/envs/py3.7/lib/python3.7/multiprocessing/pool.py in next(self, timeout)
746 if success:
747 return value
--> 748 raise value
749
750 __next__ = next # XXX

~/anaconda3/envs/py3.7/lib/python3.7/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/pandas/summary_pandas.py in multiprocess_1d(args)
77 """
78 column, series = args
---> 79 return column, describe_1d(config, series, summarizer, typeset)
80
81 pool_size = config.pool_size

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
313 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
314 try:
--> 315 return func(*args, **kwargs)
316 except TypeError as ex:
317 raise DispatchError(f"Function {func.__code__}") from ex

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/pandas/summary_pandas.py in pandas_describe_1d(config, series, summarizer, typeset)
55
56 typeset.type_schema[series.name] = vtype
---> 57 return summarizer.summarize(config, series, dtype=vtype)
58
59

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/summarizer.py in summarize(self, config, series, dtype)
37 object:
38 """
---> 39 _, _, summary = self.handle(str(dtype), config, series, {"type": str(dtype)})
40 return summary
41

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/handler.py in handle(self, dtype, *args, **kwargs)
60 funcs = self.mapping.get(dtype, [])
61 op = compose(funcs)
---> 62 return op(*args)
63
64

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/handler.py in func2(*x)
19 return f(*x)
20 else:
---> 21 return f(*res)
22
23 return func2

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/handler.py in func2(*x)
19 return f(*x)
20 else:
---> 21 return f(*res)
22
23 return func2

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/handler.py in func2(*x)
19 return f(*x)
20 else:
---> 21 return f(*res)
22
23 return func2

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/handler.py in func2(*x)
15 def func(f: Callable, g: Callable) -> Callable:
16 def func2(*x) -> Any:
---> 17 res = g(*x)
18 if type(res) == bool:
19 return f(*x)

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/multimethod/__init__.py in __call__(self, *args, **kwargs)
313 func = self[tuple(func(arg) for func, arg in zip(self.type_checkers, args))]
314 try:
--> 315 return func(*args, **kwargs)
316 except TypeError as ex:
317 raise DispatchError(f"Function {func.__code__}") from ex

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/summary_algorithms.py in inner(config, series, summary)
63 if not summary["hashable"]:
64 return config, series, summary
---> 65 return fn(config, series, summary)
66
67 return inner

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/summary_algorithms.py in inner(config, series, summary)
80 series = series.dropna()
81
---> 82 return fn(config, series, summary)
83
84 return inner

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/pandas/describe_numeric_pandas.py in pandas_describe_numeric_1d(config, series, summary)
118
119 if chi_squared_threshold > 0.0:
--> 120 stats["chi_squared"] = chi_square(finite_values)
121
122 stats["range"] = stats["max"] - stats["min"]

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/ydata_profiling/model/summary_algorithms.py in chi_square(values, histogram)
50 ) -> dict:
51 if histogram is None:
---> 52 histogram, _ = np.histogram(values, bins="auto")
53 return dict(chisquare(histogram)._asdict())
54

<array_function internals> in histogram(*args, **kwargs)

~/anaconda3/envs/py3.7/lib/python3.7/site-packages/numpy/lib/histograms.py in histogram(a, bins, range, normed, weights, density)
854 # The index computation is not guaranteed to give exactly
855 # consistent results within ~1 ULP of the bin edges.
--> 856 decrement = tmp_a < bin_edges[indices]
857 indices[decrement] -= 1
858 # The last bin includes the right edge. The other bins do not.

IndexError: index -9223372036854775808 is out of bounds for axis 0 with size 2

Can it not handle the float32 data type?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
information requested ❔ Cannot reproduce, waiting for minimum reproduction details.
Projects
None yet
Development

No branches or pull requests

5 participants