feat: minimal spark deployment (#1132)
* Initial commit of spark-branch with all packed logic/tests in semi-working state
chanedwin authored and vascoalramos committed Jan 30, 2023
1 parent a105d32 commit 4113389
Showing 39 changed files with 1,935 additions and 49 deletions.
87 changes: 87 additions & 0 deletions .github/workflows/tests.yml
@@ -131,4 +131,91 @@ jobs:

      - run: make test_cov

      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Windows')
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - run: |
          pip install --upgrade pip setuptools wheel
          pip install -r requirements.txt "${{ matrix.pandas }}" "${{ matrix.numpy }}"
          pip install -r requirements-test.txt
      - run: make install
      - run: make test_cov
      - run: codecov -F py${{ matrix.python-version }}-${{ matrix.os }}-${{ matrix.pandas }}-${{ matrix.numpy }}


  test_spark:
    runs-on: ${{ matrix.os }}
    continue-on-error: True
    strategy:
      matrix:
        os: [ ubuntu-latest ]
        python-version: [3.7, 3.8]
        pandas: ["pandas==0.25.3", "pandas==1.0.5", "pandas>1.1"]
        spark: ["2.3.0", "2.4.7", "3.0.1"]
        hadoop: [ 2.7 ]
        numpy: ["numpy==1.19.5"]
        java_home: [ /usr/lib/jvm/java-8-openjdk-amd64 ]
        exclude:
          - python-version: 3.8
            spark: "2.3.0"
          - python-version: 3.8
            spark: "2.4.7"
          # - os: macos-latest
          #   python-version: 3.6
          #   pandas: ">1.1"
          # - os: windows-2016
          #   python-version: 3.6
          #   pandas: ">1.1"

    name: Tests Spark | python ${{ matrix.python-version }}, ${{ matrix.os }}, spark${{ matrix.spark }}, ${{ matrix.pandas }}, ${{ matrix.numpy }}
    env:
      JAVA_HOME: ${{ matrix.java_home }}
      SPARK_VERSION: ${{ matrix.spark }}
      HADOOP_VERSION: ${{ matrix.hadoop }}
      SPARK_DIRECTORY: ${{ github.workspace }}/../
      SPARK_HOME: ${{ github.workspace }}/../spark/
    steps:
      - uses: actions/checkout@v2
      - name: Setup python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Linux')
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - uses: actions/cache@v2
        if: startsWith(runner.os, 'macOS')
        with:
          path: ~/Library/Caches/pip
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - uses: actions/cache@v2
        if: startsWith(runner.os, 'Windows')
        with:
          path: ~\AppData\Local\pip\Cache
          key: ${{ runner.os }}-${{ matrix.pandas }}-pip-${{ hashFiles('**/requirements.txt') }}
          restore-keys: |
            ${{ runner.os }}-${{ matrix.pandas }}-pip-
      - run: |
          pip install --upgrade pip setuptools wheel
          pip install "pytest-spark>=0.6.0" pyarrow==1.0.1 pyspark=="${{ matrix.spark }}"
          pip install -r requirements.txt
          pip install -r requirements-test.txt
          pip install "${{ matrix.pandas }}" "${{ matrix.numpy }}"
      - if: ${{ matrix.spark != '3.0.1' }}
        run: echo "ARROW_PRE_0_15_IPC_FORMAT=1" >> $GITHUB_ENV
      - run: echo "SPARK_LOCAL_IP=127.0.0.1" >> $GITHUB_ENV
      - run: make install
      - run: make install-spark-ci
      - run: make test_spark

3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -13,6 +13,9 @@ recursive-include src/pandas_profiling/report/presentation/flavours/html/templat
# Configuration
include src/pandas_profiling/*.yaml

# Spark Dev venv
recursive-include venv *.yml

# Exclude development, docs, testing and example code
exclude .pre-commit-config.yaml
exclude commitlint.config.js
11 changes: 11 additions & 0 deletions Makefile
@@ -12,6 +12,10 @@ test:
	pytest --nbval tests/notebooks/
	pandas_profiling -h

test_spark:
	pytest --spark_home=${SPARK_HOME} tests/backends/spark_backend/
	pandas_profiling -h

test_cov:
	pytest --cov=. tests/unit/
	pytest --cov=. --cov-append tests/issues/
@@ -30,6 +34,13 @@ package:
install:
	pip install -e .[notebook]

install-spark-ci:
	sudo apt-get update
	sudo apt-get -y install openjdk-8-jdk
	curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
		--output ${SPARK_DIRECTORY}/spark.tgz
	cd ${SPARK_DIRECTORY} && tar -xvzf spark.tgz && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark

lint:
	pre-commit run --all-files

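The new `test_spark` target relies on the pytest-spark plugin (added to requirements-test.txt below), which provides a `spark_session` fixture once `--spark_home` or `SPARK_HOME` points at a local Spark installation such as the one unpacked by `install-spark-ci`. A minimal illustrative test; the file name and assertions are hypothetical and not part of this commit:

```python
# Illustrative only; the real tests live under tests/backends/spark_backend/.
def test_spark_session_smoke(spark_session):
    # `spark_session` is the fixture provided by the pytest-spark plugin.
    df = spark_session.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    assert df.count() == 2
    assert set(df.columns) == {"id", "label"}
```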
5 changes: 5 additions & 0 deletions requirements-spark.txt
@@ -0,0 +1,5 @@
# Recommended pyspark and pyarrow versions for running pandas-profiling on Spark.
# Note: with pyspark 2.3 or 2.4 and pyarrow >= 0.15, you may need to set
# ARROW_PRE_0_15_IPC_FORMAT=1 in your conf/spark-env.sh for toPandas() to work properly.
pyspark>=2.3.0
pyarrow>=2.0.0
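As a sketch of the workaround mentioned in the comment above: besides exporting the variable in conf/spark-env.sh (or, as the CI job does, via $GITHUB_ENV), the legacy Arrow IPC format can be enabled from Python before the session is created. The app name and the executor-env config key below are illustrative assumptions:

```python
import os

from pyspark.sql import SparkSession

# Only relevant for pyspark 2.3/2.4 combined with pyarrow >= 0.15;
# Spark 3.x understands the newer Arrow IPC format natively.
os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

spark = (
    SparkSession.builder.appName("profiling-arrow-compat")  # name is illustrative
    .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", "1")
    .getOrCreate()
)
```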
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -2,6 +2,7 @@ pytest
coverage~=6.5
codecov
pytest-cov
pytest-spark
nbval
pyarrow
twine>=3.1.1
5 changes: 5 additions & 0 deletions src/pandas_profiling/__init__.py
@@ -2,6 +2,7 @@
.. include:: ../../README.md
"""
import importlib.util

from pandas_profiling.compare_reports import compare
from pandas_profiling.controller import pandas_decorator
@@ -11,6 +12,10 @@
# backend
import pandas_profiling.model.pandas # isort:skip # noqa

spec = importlib.util.find_spec("pyspark")
if spec is not None:
    import pandas_profiling.model.spark  # isort:skip # noqa


__all__ = [
    "pandas_decorator",
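With the conditional import above, the Spark backend is registered whenever pyspark is importable. A usage sketch — it assumes ProfileReport accepts a Spark DataFrame directly once the backend is loaded; the session setup and toy data are illustrative:

```python
import pandas as pd
from pyspark.sql import SparkSession

from pandas_profiling import ProfileReport

spark = SparkSession.builder.appName("profiling-demo").getOrCreate()

# Any Spark DataFrame works the same way; this one is built from toy data.
sdf = spark.createDataFrame(pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "x"]}))

report = ProfileReport(sdf, title="Spark profiling demo")
report.to_file("spark_report.html")
```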
36 changes: 36 additions & 0 deletions src/pandas_profiling/config.py
@@ -351,6 +351,42 @@ def from_file(config_file: Union[Path, str]) -> "Settings":
return Settings().parse_obj(data)


class SparkSettings(Settings):
    """Profiling defaults for the Spark backend: no dtype inference, numeric
    columns are never demoted to categorical, only Pearson and Spearman
    correlations are computed, and continuous interactions, missing-value
    diagrams and tail/random samples are disabled."""
    vars: Univariate = Univariate()

    vars.num.low_categorical_threshold = 0

    infer_dtypes = False

    correlations: Dict[str, Correlation] = {
        "spearman": Correlation(key="spearman"),
        "pearson": Correlation(key="pearson"),
        "kendall": Correlation(key="kendall"),
        "cramers": Correlation(key="cramers"),
        "phi_k": Correlation(key="phi_k"),
    }
    correlations["pearson"].calculate = True
    correlations["spearman"].calculate = True
    correlations["kendall"].calculate = False
    correlations["cramers"].calculate = False
    correlations["phi_k"].calculate = False

    interactions: Interactions = Interactions()
    interactions.continuous = False

    missing_diagrams: Dict[str, bool] = {
        "bar": False,
        "matrix": False,
        "dendrogram": False,
        "heatmap": False,
    }

    samples: Samples = Samples()
    samples.tail = 0
    samples.random = 0


class Config:
    arg_groups: Dict[str, Any] = {
        "sensitive": {
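SparkSettings above simply pre-sets a Settings object with Spark-friendly defaults, so it can be inspected and adjusted like any other configuration. A small sketch — the printed values follow the class body above, while the final tweaks are hypothetical:

```python
from pandas_profiling.config import SparkSettings

cfg = SparkSettings()

# Defaults taken from the class body above.
print(cfg.correlations["pearson"].calculate)   # True
print(cfg.correlations["kendall"].calculate)   # False
print(cfg.interactions.continuous)             # False
print(cfg.samples.tail)                        # 0

# Hypothetical per-report tweaks; SparkSettings behaves like any Settings object.
cfg.correlations["spearman"].calculate = False
cfg.vars.num.low_categorical_threshold = 5
```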
5 changes: 2 additions & 3 deletions src/pandas_profiling/model/alerts.py
@@ -285,7 +285,6 @@ def supported_alerts(summary: dict) -> List[Alert]:
            )
        )
    if summary.get("n_distinct", np.nan) == 1:
        summary["mode"] = summary["value_counts_without_nan"].index[0]
        alerts.append(
            Alert(
                alert_type=AlertType.CONSTANT,
@@ -379,11 +378,11 @@ def get_alerts(


def alert_value(value: float) -> bool:
    return not np.isnan(value) and value > 0.01
    return not pd.isna(value) and value > 0.01


def skewness_alert(v: float, threshold: int) -> bool:
    return not np.isnan(v) and (v < (-1 * threshold) or v > threshold)
    return not pd.isna(v) and (v < (-1 * threshold) or v > threshold)


def type_date_alert(series: pd.Series) -> bool:
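The switch from np.isnan to pd.isna in alert_value and skewness_alert matters because pd.isna also accepts non-float missing values such as None, which np.isnan rejects — presumably the kind of values a non-pandas backend can feed into these helpers. A quick illustration of the difference:

```python
import numpy as np
import pandas as pd

print(pd.isna(np.nan))  # True
print(pd.isna(None))    # True
print(pd.isna(0.5))     # False

try:
    np.isnan(None)
except TypeError as err:
    # np.isnan only accepts numeric input, so a stray None in a summary
    # would previously have crashed the alert checks.
    print(f"np.isnan(None) raised: {err}")
```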
4 changes: 0 additions & 4 deletions src/pandas_profiling/model/correlations.py
@@ -89,10 +89,6 @@ def calculate_correlation(
    Returns:
        The correlation matrices for the given correlation measures. Return None if correlation is empty.
    """

    if len(df) == 0:
        return None

    correlation_measures = {
        "auto": Auto,
        "pearson": Pearson,
43 changes: 24 additions & 19 deletions src/pandas_profiling/model/describe.py
@@ -90,21 +90,31 @@ def describe(
        ]
        pbar.update()

        # Get correlations
        correlation_names = get_active_correlations(config)
        pbar.total += len(correlation_names)

        correlations = {
            correlation_name: progress(
                calculate_correlation, pbar, f"Calculate {correlation_name} correlation"
            )(config, df, correlation_name, series_description)
            for correlation_name in correlation_names
        }
        # Table statistics
        table_stats = progress(get_table_stats, pbar, "Get dataframe statistics")(
            config, df, series_description
        )

        # make sure correlations is not None
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }
        # Get correlations
        if table_stats["n"] != 0:
            correlation_names = get_active_correlations(config)
            pbar.total += len(correlation_names)

            correlations = {
                correlation_name: progress(
                    calculate_correlation,
                    pbar,
                    f"Calculate {correlation_name} correlation",
                )(config, df, correlation_name, series_description)
                for correlation_name in correlation_names
            }

            # make sure correlations is not None
            correlations = {
                key: value for key, value in correlations.items() if value is not None
            }
        else:
            correlations = {}

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
@@ -118,11 +128,6 @@
                get_scatter_plot, pbar, f"scatter {x}, {y}"
            )(config, df, x, y, interval_columns)

        # Table statistics
        table_stats = progress(get_table_stats, pbar, "Get dataframe statistics")(
            config, df, series_description
        )

        # missing diagrams
        missing_map = get_missing_active(config, table_stats)
        pbar.total += len(missing_map)
@@ -45,6 +45,7 @@ def pandas_describe_counts(
"value_counts_without_nan": value_counts_without_nan,
}
)

try:
summary["value_counts_index_sorted"] = summary[
"value_counts_without_nan"
34 changes: 34 additions & 0 deletions src/pandas_profiling/model/spark/__init__.py
@@ -0,0 +1,34 @@
from pandas_profiling.model.spark import (
    correlations_spark,
    dataframe_spark,
    describe_boolean_spark,
    describe_categorical_spark,
    describe_counts_spark,
    describe_date_spark,
    describe_generic_spark,
    describe_numeric_spark,
    describe_supported_spark,
    duplicates_spark,
    missing_spark,
    sample_spark,
    summary_spark,
    table_spark,
)

__all__ = [
    "correlations_spark",
    "dataframe_spark",
    "describe_boolean_spark",
    "describe_categorical_spark",
    "describe_counts_spark",
    "describe_date_spark",
    "describe_generic_spark",
    "describe_numeric_spark",
    "describe_supported_spark",
    "duplicates_spark",
    "missing_spark",
    "sample_spark",
"sample_spark",
"summary_spark",
"table_spark",
]
