feat: minimal spark deployment (#1132)
* Initial commit of spark-branch with all packed logic/tests in semi-working state
chanedwin authored and vascoalramos committed Jan 30, 2023
1 parent a105d32 commit 4113389
Showing 39 changed files with 1,935 additions and 49 deletions.
87 changes: 87 additions & 0 deletions .github/workflows/tests.yml
@@ -131,4 +131,91 @@ jobs:

      - run: make test_cov

      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Windows')
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - run: |
          pip install --upgrade pip setuptools wheel
          pip install -r requirements.txt "${{ matrix.pandas }}" "${{ matrix.numpy }}"
          pip install -r requirements-test.txt
      - run: make install
      - run: make test_cov
      - run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}


  test_spark:
    runs-on: ${{ matrix.os }}
    continue-on-error: True
    strategy:
      matrix:
        os: [ ubuntu-latest ]
        python-version: [3.7, 3.8]
        pandas: ["pandas==0.25.3", "pandas==1.0.5", "pandas>1.1"]
        spark: ["2.3.0", "2.4.7", "3.0.1"]
        hadoop: [ 2.7 ]
        numpy: ["numpy==1.19.5"]
        java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]
        exclude:
          - python-version: 3.8
            spark: "2.3.0"
          - python-version: 3.8
            spark: "2.4.7"
          # - os: macos-latest
          #   python-version: 3.6
          #   pandas: ">1.1"
          # - os: windows-2016
          #   python-version: 3.6
          #   pandas: ">1.1"

    name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
    env:
      JAVA_HOME: ${{ matrix.java_home }}
      SPARK_VERSION: ${{ matrix.spark }}
      HADOOP_VERSION: ${{ matrix.hadoop }}
      SPARK_DIRECTORY: ${{ github.workspace }}/../
      SPARK_HOME: ${{ github.workspace }}/../spark/
    steps:
      - uses: actions/checkout@v2
      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Linux')
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - uses: actions/cache@v2
        if: startsWith(runner.os, 'macOS')
        with:
          path: ~/Library/Caches/pip
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Windows')
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - run: |
          pip install --upgrade pip setuptools wheel
          pip install "pytest-spark>=0.6.0" pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
          pip install -r requirements.txt
          pip install -r requirements-test.txt
          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
      - if: ${{ matrix.spark != '3.0.1' }}
        run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
      - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
      - run: make install
      - run: make install-spark-ci
      - run: make test_spark

3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -13,6 +13,9 @@ recursive-include src/pandas_profiling/report/presentation/flavours/html/templat
# Configuration
include src/pandas_profiling/*.yaml

# Spark Dev venv
recursive-include venv *.yml

# Exclude development, docs, testing and example code
exclude .pre-commit-config.yaml
exclude commitlint.config.js
11 changes: 11 additions & 0 deletions Makefile
@@ -12,6 +12,10 @@ test:
	pytest --nbval tests/notebooks/
	pandas_profiling -h

test_spark:
	pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
	pandas_profiling -h

test_cov:
	pytest --cov=. tests/unit/
	pytest --cov=. --cov-append tests/issues/
@@ -30,6 +34,13 @@ package:
install:
	pip install -e .[notebook]

install-spark-ci:
	sudo apt-get update
	sudo apt-get -y install openjdk-8-jdk
	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
		--output ${SPARK_DIRECTORY}/spark.tgz
	cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark

lint:
	pre-commit run --all-files

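The new `test_spark` target relies on the pytest-spark plugin (added to requirements-test.txt below), which provides a `spark_session` fixture once `--spark_home` or `SPARK_HOME` points at a local Spark installation such as the one unpacked by `install-spark-ci`. A minimal illustrative test; the file name and assertions are hypothetical and not part of this commit:

```python
# Illustrative only; the real tests live under tests/backends/spark_backend/.
def test_spark_session_smoke(spark_session):
    # `spark_session` is the fixture provided by the pytest-spark plugin.
    df = spark_session.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2
    assert set(df.columns) == {"id", "label"}
```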
5 changes: 5 additions & 0 deletions requirements-spark.txt
@@ -0,0 +1,5 @@
# Recommended pyspark and pyarrow versions for running pandas-profiling on Spark.
# Note: with pyspark 2.3 or 2.4 and pyarrow >= 0.15, you may need to set
# ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas() to work properly.
pyspark>=2.3.0
pyarrow>=2.0.0
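As a sketch of the workaround mentioned in the comment above: besides exporting the variable in conf/spark-env.sh (or, as the CI job does, via $GITHUB_ENV), the legacy Arrow IPC format can be enabled from Python before the session is created. The app name and the executor-env config key below are illustrative assumptions:

```python
import os

from pyspark.sql import SparkSession

# Only relevant for pyspark 2.3/2.4 combined with pyarrow >= 0.15;
# Spark 3.x understands the newer Arrow IPC format natively.
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

spark = (
    SparkSession.builder.appName("profiling-arrow-compat")  # name is illustrative
    .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    .getOrCreate()
)
```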
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -2,6 +2,7 @@ pytest
coverage~=6.5
codecov
pytest-cov
pytest-spark
nbval
pyarrow
twine>=3.1.1
5 changes: 5 additions & 0 deletions src/pandas_profiling/__init__.py
@@ -2,6 +2,7 @@
.. include:: ../../README.md
"""
import importlib.util

from pandas_profiling.compare_reports import compare
from pandas_profiling.controller import pandas_decorator
@@ -11,6 +12,10 @@
# backend
import pandas_profiling.model.pandas # isort:skip # noqa

spec = importlib.util.find_spec("pyspark")
if spec is not None:
    import pandas_profiling.model.spark  # isort:skip # noqa


__all__ = [
    "pandas_decorator",
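With the conditional import above, the Spark backend is registered whenever pyspark is importable. A usage sketch — it assumes ProfileReport accepts a Spark DataFrame directly once the backend is loaded; the session setup and toy data are illustrative:

```python
import pandas as pd
from pyspark.sql import SparkSession

from pandas_profiling import ProfileReport

spark = SparkSession.builder.appName("profiling-demo").getOrCreate()

# Any Spark DataFrame works the same way; this one is built from toy data.
sdf = spark.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "x"]}))

report = ProfileReport(sdf, title="Spark profiling demo")
report.to_file("spark_report.html")
```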
36 changes: 36 additions & 0 deletions src/pandas_profiling/config.py
@@ -351,6 +351,42 @@ def from_file(config_file: Union[Path, str]) -> "Settings":
return Settings().parse_obj(data)


class SparkSettings(Settings):
    """Profiling defaults for the Spark backend: no dtype inference, numeric
    columns are never demoted to categorical, only Pearson and Spearman
    correlations are computed, and continuous interactions, missing-value
    diagrams and tail/random samples are disabled."""
    vars: Univariate = Univariate()

    vars.num.low_categorical_threshold = 0

    infer_dtypes = False

    correlations: Dict[str, Correlation] = {
        "spearman": Correlation(key="spearman"),
        "pearson": Correlation(key="pearson"),
        "kendall": Correlation(key="kendall"),
        "cramers": Correlation(key="cramers"),
        "phi_k": Correlation(key="phi_k"),
    }
    correlations["pearson"].calculate = True
    correlations["spearman"].calculate = True
    correlations["kendall"].calculate = False
    correlations["cramers"].calculate = False
    correlations["phi_k"].calculate = False

    interactions: Interactions = Interactions()
    interactions.continuous = False

    missing_diagrams: Dict[str, bool] = {
        "bar": False,
        "matrix": False,
        "dendrogram": False,
        "heatmap": False,
    }

    samples: Samples = Samples()
    samples.tail = 0
    samples.random = 0


class Config:
    arg_groups: Dict[str, Any] = {
        "sensitive": {
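SparkSettings above simply pre-sets a Settings object with Spark-friendly defaults, so it can be inspected and adjusted like any other configuration. A small sketch — the printed values follow the class body above, while the final tweaks are hypothetical:

```python
from pandas_profiling.config import SparkSettings

cfg = SparkSettings()

# Defaults taken from the class body above.
print(cfg.correlations["pearson"].calculate)   # True
print(cfg.correlations["kendall"].calculate)   # False
print(cfg.interactions.continuous)             # False
print(cfg.samples.tail)                        # 0

# Hypothetical per-report tweaks; SparkSettings behaves like any Settings object.
cfg.correlations["spearman"].calculate = False
cfg.vars.num.low_categorical_threshold = 5
```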
5 changes: 2 additions & 3 deletions src/pandas_profiling/model/alerts.py
@@ -285,7 +285,6 @@ def supported_alerts(summary: dict) -> List[Alert]:
            )
        )
    if summary.get("n_distinct", np.nan) == 1:
        summary["mode"] = summary["value_counts_without_nan"].index[0]
        alerts.append(
            Alert(
                alert_type=AlertType.CONSTANT,
@@ -379,11 +378,11 @@ def get_alerts(


def alert_value(value: float) -> bool:
    return not np.isnan(value) and value > 0.01
    return not pd.isna(value) and value > 0.01


def skewness_alert(v: float, threshold: int) -> bool:
    return not np.isnan(v) and (v < (-1 * threshold) or v > threshold)
    return not pd.isna(v) and (v < (-1 * threshold) or v > threshold)


def type_date_alert(series: pd.Series) -> bool:
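The switch from np.isnan to pd.isna in alert_value and skewness_alert matters because pd.isna also accepts non-float missing values such as None, which np.isnan rejects — presumably the kind of values a non-pandas backend can feed into these helpers. A quick illustration of the difference:

```python
import numpy as np
import pandas as pd

print(pd.isna(np.nan))  # True
print(pd.isna(None))    # True
print(pd.isna(0.5))     # False

try:
    np.isnan(None)
except TypeError as err:
    # np.isnan only accepts numeric input, so a stray None in a summary
    # would previously have crashed the alert checks.
    print(f"np.isnan(None) raised: {err}")
```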
4 changes: 0 additions & 4 deletions src/pandas_profiling/model/correlations.py
@@ -89,10 +89,6 @@ def calculate_correlation(
    Returns:
        The correlation matrices for the given correlation measures. Return None if correlation is empty.
    """

    if len(df) == 0:
        return None

    correlation_measures = {
        "auto": Auto,
        "pearson": Pearson,
43 changes: 24 additions & 19 deletions src/pandas_profiling/model/describe.py
@@ -90,21 +90,31 @@ def describe(
        ]
        pbar.update()

        # Get correlations
        correlation_names = get_active_correlations(config)
        pbar.total += len(correlation_names)

        correlations = {
            correlation_name: progress(
                calculate_correlation, pbar, f"Calculate {correlation_name} correlation"
            )(config, df, correlation_name, series_description)
            for correlation_name in correlation_names
        }
        # Table statistics
        table_stats = progress(get_table_stats, pbar, "Get dataframe statistics")(
            config, df, series_description
        )

        # make sure correlations is not None
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }
        # Get correlations
        if table_stats["n"] != 0:
            correlation_names = get_active_correlations(config)
            pbar.total += len(correlation_names)

            correlations = {
                correlation_name: progress(
                    calculate_correlation,
                    pbar,
                    f"Calculate {correlation_name} correlation",
                )(config, df, correlation_name, series_description)
                for correlation_name in correlation_names
            }

            # make sure correlations is not None
            correlations = {
                key: value for key, value in correlations.items() if value is not None
            }
        else:
            correlations = {}

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
@@ -118,11 +128,6 @@
                get_scatter_plot, pbar, f"scatter {x}, {y}"
            )(config, df, x, y, interval_columns)

        # Table statistics
        table_stats = progress(get_table_stats, pbar, "Get dataframe statistics")(
            config, df, series_description
        )

        # missing diagrams
        missing_map = get_missing_active(config, table_stats)
        pbar.total += len(missing_map)
@@ -45,6 +45,7 @@ def pandas_describe_counts(
"value_counts_without_nan": value_counts_without_nan,
}
)

try:
summary["value_counts_index_sorted"] = summary[
"value_counts_without_nan"
34 changes: 34 additions & 0 deletions src/pandas_profiling/model/spark/__init__.py
@@ -0,0 +1,34 @@
from pandas_profiling.model.spark import (
    correlations_spark,
    dataframe_spark,
    describe_boolean_spark,
    describe_categorical_spark,
    describe_counts_spark,
    describe_date_spark,
    describe_generic_spark,
    describe_numeric_spark,
    describe_supported_spark,
    duplicates_spark,
    missing_spark,
    sample_spark,
    summary_spark,
    table_spark,
)

__all__ = [
    "correlations_spark",
    "dataframe_spark",
    "describe_boolean_spark",
    "describe_categorical_spark",
    "describe_counts_spark",
    "describe_date_spark",
    "describe_generic_spark",
    "describe_numeric_spark",
    "describe_supported_spark",
    "duplicates_spark",
    "missing_spark",
    "sample_spark",
"sample_spark",
"summary_spark",
"table_spark",
]
