From be549bb61e0846089b9190018b1f6411936410d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:01:28 +0000 Subject: [PATCH 01/20] Add Transformation and rename package (#18) --- CONTRIBUTING.md | 6 +- README.md | 47 +-- .../{teckle => teckel}/api/etl/Run.scala | 12 +- .../{teckle => teckel}/api/etl/package.scala | 4 +- .../api/spark/SparkETL.scala | 5 +- .../{teckle => teckel}/api/ExampleSpec.scala | 8 +- build.sbt | 15 +- docs/etl/complex.yaml | 41 +++ docs/etl/group-by.yaml | 24 ++ docs/etl/order-by.yaml | 22 ++ docs/etl/select.yaml | 21 ++ docs/etl/where.yaml | 19 ++ docs/formal-definition.md | 35 +++ .../{teckle-banner.png => teckel-banner.png} | Bin ...e-logo-small.png => teckel-logo-small.png} | Bin .../{teckle-logo.png => teckel-logo.png} | Bin .../api/example/EffectExample.scala | 10 +- .../api/example/Example.scala | 10 +- .../api/example/UnsafeExample.scala | 10 +- .../{teckle => teckel}/model/Asset.scala | 2 +- .../com/eff3ct/teckel/model/Source.scala | 59 ++++ .../{teckle => teckel}/model/package.scala | 6 +- project/BuildPlugin.scala | 7 +- project/Dependency.scala | 13 +- project/Library.scala | 3 + project/Version.scala | 20 +- .../semantic/core}/EvalAsset.scala | 8 +- .../semantic/core}/EvalContext.scala | 4 +- .../teckel/semantic/core/Semantic.scala | 38 +++ .../semantic/evaluation.scala | 29 +- .../semantic/execution.scala | 33 +- .../{teckle => teckel}/semantic/package.scala | 10 +- .../teckel/semantic/sources/Debug.scala | 81 +++++ .../eff3ct/teckel/semantic/sources/Exec.scala | 47 +++ .../teckel/semantic/sources/package.scala | 17 +- .../src/test/resources/data/csv/example.csv | 29 ++ .../eff3ct/teckel/semantic/DebugSource.scala | 111 +++++++ .../teckel/semantic/SparkTestUtils.scala | 39 +++ .../serializer/Serializer.scala | 2 +- .../serializer/alternative.scala | 2 +- .../eff3ct/teckel/serializer/model/etl.scala | 48 +++ 
.../teckel/serializer/model/input.scala | 56 ++++ .../teckel/serializer/model/operations.scala | 65 ++++ .../serializer/model/output.scala} | 34 +- .../serializer/model/transformation.scala | 61 ++++ .../serializer/package.scala | 2 +- .../serializer/types/PrimitiveType.scala | 2 +- .../serializer/types/implicits.scala | 4 +- .../com/eff3ct/teckel/transform/Rewrite.scala | 101 ++++++ .../com/eff3ct/teckle/transform/Rewrite.scala | 58 ---- serializer/src/test/resources/complex.json | 53 ++++ serializer/src/test/resources/complex.yaml | 43 +++ .../serializer/DefaultSerializerSpec.scala | 276 +++++++++++++++++ .../teckel/serializer/ExampleSpec.scala | 133 ++++++++ .../jsonspec/JsonSerializerSpec.scala | 292 ++++++++++++++++++ .../yamlspec/YamlSerializerSpec.scala | 279 +++++++++++++++++ .../serializer/DefaultSerializerSpec.scala | 106 ------- .../teckle/serializer/ExampleSpec.scala | 93 ------ .../jsonspec/JsonSerializerSpec.scala | 118 ------- .../yamlspec/YamlSerializerSpec.scala | 106 ------- 60 files changed, 2121 insertions(+), 658 deletions(-) rename api/src/main/scala/com/eff3ct/{teckle => teckel}/api/etl/Run.scala (90%) rename api/src/main/scala/com/eff3ct/{teckle => teckel}/api/etl/package.scala (94%) rename api/src/main/scala/com/eff3ct/{teckle => teckel}/api/spark/SparkETL.scala (94%) rename api/src/test/scala/com/eff3ct/{teckle => teckel}/api/ExampleSpec.scala (92%) create mode 100644 docs/etl/complex.yaml create mode 100644 docs/etl/group-by.yaml create mode 100644 docs/etl/order-by.yaml create mode 100644 docs/etl/select.yaml create mode 100644 docs/etl/where.yaml create mode 100644 docs/formal-definition.md rename docs/images/{teckle-banner.png => teckel-banner.png} (100%) rename docs/images/{teckle-logo-small.png => teckel-logo-small.png} (100%) rename docs/images/{teckle-logo.png => teckel-logo.png} (100%) rename example/src/main/scala/com/eff3ct/{teckle => teckel}/api/example/EffectExample.scala (87%) rename example/src/main/scala/com/eff3ct/{teckle 
=> teckel}/api/example/Example.scala (87%) rename example/src/main/scala/com/eff3ct/{teckle => teckel}/api/example/UnsafeExample.scala (87%) rename model/src/main/scala/com/eff3ct/{teckle => teckel}/model/Asset.scala (97%) create mode 100644 model/src/main/scala/com/eff3ct/teckel/model/Source.scala rename model/src/main/scala/com/eff3ct/{teckle => teckel}/model/package.scala (92%) rename semantic/src/main/scala/com/eff3ct/{teckle/semantic => teckel/semantic/core}/EvalAsset.scala (89%) rename semantic/src/main/scala/com/eff3ct/{teckle/semantic => teckel/semantic/core}/EvalContext.scala (93%) create mode 100644 semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala rename semantic/src/main/scala/com/eff3ct/{teckle => teckel}/semantic/evaluation.scala (67%) rename semantic/src/main/scala/com/eff3ct/{teckle => teckel}/semantic/execution.scala (65%) rename semantic/src/main/scala/com/eff3ct/{teckle => teckel}/semantic/package.scala (86%) create mode 100644 semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Debug.scala create mode 100644 semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Exec.scala rename model/src/main/scala/com/eff3ct/teckle/model/Source.scala => semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/package.scala (78%) create mode 100644 semantic/src/test/resources/data/csv/example.csv create mode 100644 semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala create mode 100644 semantic/src/test/scala/com/eff3ct/teckel/semantic/SparkTestUtils.scala rename serializer/src/main/scala/com/eff3ct/{teckle => teckel}/serializer/Serializer.scala (97%) rename serializer/src/main/scala/com/eff3ct/{teckle => teckel}/serializer/alternative.scala (97%) create mode 100644 serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala create mode 100644 serializer/src/main/scala/com/eff3ct/teckel/serializer/model/input.scala create mode 100644 
serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala rename serializer/src/main/scala/com/eff3ct/{teckle/serializer/model.scala => teckel/serializer/model/output.scala} (68%) create mode 100644 serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala rename serializer/src/main/scala/com/eff3ct/{teckle => teckel}/serializer/package.scala (97%) rename serializer/src/main/scala/com/eff3ct/{teckle => teckel}/serializer/types/PrimitiveType.scala (97%) rename serializer/src/main/scala/com/eff3ct/{teckle => teckel}/serializer/types/implicits.scala (96%) create mode 100644 serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala delete mode 100644 serializer/src/main/scala/com/eff3ct/teckle/transform/Rewrite.scala create mode 100644 serializer/src/test/resources/complex.json create mode 100644 serializer/src/test/resources/complex.yaml create mode 100644 serializer/src/test/scala/com/eff3ct/teckel/serializer/DefaultSerializerSpec.scala create mode 100644 serializer/src/test/scala/com/eff3ct/teckel/serializer/ExampleSpec.scala create mode 100644 serializer/src/test/scala/com/eff3ct/teckel/serializer/jsonspec/JsonSerializerSpec.scala create mode 100644 serializer/src/test/scala/com/eff3ct/teckel/serializer/yamlspec/YamlSerializerSpec.scala delete mode 100644 serializer/src/test/scala/com/eff3ct/teckle/serializer/DefaultSerializerSpec.scala delete mode 100644 serializer/src/test/scala/com/eff3ct/teckle/serializer/ExampleSpec.scala delete mode 100644 serializer/src/test/scala/com/eff3ct/teckle/serializer/jsonspec/JsonSerializerSpec.scala delete mode 100644 serializer/src/test/scala/com/eff3ct/teckle/serializer/yamlspec/YamlSerializerSpec.scala diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index bcc6be6..a6ee418 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,8 +17,8 @@ Fork the repository on GitHub by clicking the "Fork" button at the top right of Clone your fork to your local machine: ```sh -git clone 
https://github.com/your-username/teckle.git -cd teckle +git clone https://github.com/your-username/teckel.git +cd teckel ``` ### 3. Create a Branch @@ -63,4 +63,4 @@ If you find a bug or have a feature request, please create an issue on GitHub. P ## Thank You! -Thank you for contributing to Teckle! Your help is greatly appreciated. \ No newline at end of file +Thank you for contributing to Teckel! Your help is greatly appreciated. \ No newline at end of file diff --git a/README.md b/README.md index 1c3b077..605c260 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Teckel is a framework designed to simplify the creation of Apache Spark ETL (Ext Load) processes using YAML configuration files. This tool aims to standardize and streamline ETL workflow creation by enabling the definition of data transformations in a declarative, user-friendly format without writing extensive code. -![Logo](./docs/images/teckle-banner.png) +![Logo](./docs/images/teckel-banner.png) This concept is further developed on my blog: [Big Data with Zero Code](https://blog.rafaelfernandez.dev/posts/big-data-with-zero-code/) @@ -16,42 +16,6 @@ blog: [Big Data with Zero Code](https://blog.rafaelfernandez.dev/posts/big-data- - **Flexible Transformations:** Perform joins, aggregations, and selections with clear syntax. - **Spark Compatibility:** Leverage the power of Apache Spark for large-scale data processing. -## Formal Language Definition - -Teckel uses a specific set of language constructs to define data flows. Below is the formal syntax for this DSL: - -```txt -Asset := `Asset` - -Source := | | -Input := `Input` -Output := `Output` - -// TODO: It need double-check and define correctly -Transformation ::= JoinOperation | GroupOperation | WindowOperation - -// Join -JoinOperation ::= `Join` -JoinType ::= `Inner` | `Left` | `Right` | `Cross` | ... 
-JoinRelation ::= `JoinRelation` [ ] -RelationField ::= `RelationField` - -// Group -GroupOperation ::= `Group` -By ::= `By` [Column] -Agg ::= `Agg` [Column] - -Select ::= `Select` [Column] -Where ::= `Where` [Column] - -// Type Alias -AssetRef := String -Format := String -SourceRef := String -Options := `Map` String String -Context := `Map` -``` - ## Getting Started ### Prerequisites @@ -80,9 +44,18 @@ Once you have installed Teckel, you can use it to run ETL processes. Here's an example of a fully defined ETL configuration using a YAML file: +### SQL ETL - Simple Example: [here](./docs/etl/simple.yaml) +- Complex Example: [here](./docs/etl/complex.yaml) - Other Example: [here](./docs/etl/example.yaml) +### SQL Transformations +- `Select` Example: [here](./docs/etl/select.yaml) +- `Where` Example: [here](./docs/etl/where.yaml) +- `Group By` Example: [here](./docs/etl/group-by.yaml) +- `Order By` Example: [here](./docs/etl/order-by.yaml) + + ## Development and Contribution Contributions to Teckel are welcome. If you'd like to contribute, please fork the repository and create a pull request diff --git a/api/src/main/scala/com/eff3ct/teckle/api/etl/Run.scala b/api/src/main/scala/com/eff3ct/teckel/api/etl/Run.scala similarity index 90% rename from api/src/main/scala/com/eff3ct/teckle/api/etl/Run.scala rename to api/src/main/scala/com/eff3ct/teckel/api/etl/Run.scala index 798c29f..3f86cb2 100644 --- a/api/src/main/scala/com/eff3ct/teckle/api/etl/Run.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/etl/Run.scala @@ -22,22 +22,22 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.api.etl +package com.eff3ct.teckel.api.etl import cats.effect.IO import cats.effect.unsafe.implicits.global import cats.implicits._ import cats.{Id, MonadThrow} +import com.eff3ct.teckel.semantic.core.EvalContext +import com.eff3ct.teckel.serializer._ +import com.eff3ct.teckel.serializer.model.etl._ +import com.eff3ct.teckel.transform.Rewrite import fs2.io.file.{Files, Path} -import com.eff3ct.teckle.semantic.EvalContext -import com.eff3ct.teckle.serializer._ -import com.eff3ct.teckle.serializer.model.ETL -import com.eff3ct.teckle.transform.Rewrite trait Run[F[_]] { def run[O: EvalContext](path: String): F[O] - } + object Run { def apply[F[_]: Run]: Run[F] = implicitly[Run[F]] diff --git a/api/src/main/scala/com/eff3ct/teckle/api/etl/package.scala b/api/src/main/scala/com/eff3ct/teckel/api/etl/package.scala similarity index 94% rename from api/src/main/scala/com/eff3ct/teckle/api/etl/package.scala rename to api/src/main/scala/com/eff3ct/teckel/api/etl/package.scala index 7c58374..8b27270 100644 --- a/api/src/main/scala/com/eff3ct/teckle/api/etl/package.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/etl/package.scala @@ -22,12 +22,12 @@ * SOFTWARE. */ -package com.eff3ct.teckle.api +package com.eff3ct.teckel.api import cats.Id import cats.effect.IO +import com.eff3ct.teckel.semantic.core.EvalContext import fs2.Compiler -import com.eff3ct.teckle.semantic.EvalContext package object etl { diff --git a/api/src/main/scala/com/eff3ct/teckle/api/spark/SparkETL.scala b/api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala similarity index 94% rename from api/src/main/scala/com/eff3ct/teckle/api/spark/SparkETL.scala rename to api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala index e0ed06f..379340b 100644 --- a/api/src/main/scala/com/eff3ct/teckle/api/spark/SparkETL.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.api.spark +package com.eff3ct.teckel.api.spark import cats.effect.{ExitCode, IO, IOApp} import org.apache.spark.SparkConf @@ -38,7 +38,6 @@ trait SparkETL extends IOApp { /** * Builds a Spark session - * @param config ETL configuration * @return Spark session */ private final def sparkBuilder(): SparkSession = { @@ -56,7 +55,6 @@ trait SparkETL extends IOApp { /** * Run the ETL. This method should be implemented by the ETL. - * @param config ETL configuration * @param spark Spark session * @param logger logger */ @@ -67,7 +65,6 @@ trait SparkETL extends IOApp { /** * Run the ETL using IO - * @param config ETL configuration * @param spark Spark session * @param logger logger * @return IO diff --git a/api/src/test/scala/com/eff3ct/teckle/api/ExampleSpec.scala b/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala similarity index 92% rename from api/src/test/scala/com/eff3ct/teckle/api/ExampleSpec.scala rename to api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala index 06284c3..3d961b9 100644 --- a/api/src/test/scala/com/eff3ct/teckle/api/ExampleSpec.scala +++ b/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala @@ -22,12 +22,12 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.api +package com.eff3ct.teckel.api import cats.effect.unsafe.implicits.global -import com.eff3ct.teckle.api.etl.{etl, unsafeETL} -import com.eff3ct.teckle.semantic.evaluation._ -import com.eff3ct.teckle.semantic.execution._ +import com.eff3ct.teckel.api.etl.{etl, unsafeETL} +import com.eff3ct.teckel.semantic.evaluation._ +import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.scalatest.flatspec.AnyFlatSpecLike diff --git a/build.sbt b/build.sbt index e3664d1..c0af8c0 100644 --- a/build.sbt +++ b/build.sbt @@ -1,9 +1,10 @@ +import Dependency.ProjectOps lazy val root = (project in file(".")) .disablePlugins(BuildPlugin, AssemblyPlugin, HeaderPlugin) .settings( - name := "teckle", + name := "teckel", publish / skip := true ) .aggregate( @@ -22,7 +23,7 @@ lazy val root = lazy val model = (project in file("./model")) .settings( - name := "teckle-model", + name := "teckel-model", libraryDependencies ++= Dependency.model ) @@ -30,16 +31,16 @@ lazy val semantic = (project in file("./semantic")) .dependsOn(model) .settings( - name := "teckle-semantic", + name := "teckel-semantic", libraryDependencies ++= Dependency.semantic - ) + ).withKindProjector /** Serializer */ lazy val serializer = (project in file("./serializer")) .dependsOn(model) .settings( - name := "teckle-serializer", + name := "teckel-serializer", publish / skip := false, libraryDependencies ++= Dependency.serializer ) @@ -48,7 +49,7 @@ lazy val api = (project in file("./api")) .dependsOn(serializer, semantic) .settings( - name := "teckle-api", + name := "teckel-api", publish / skip := false, libraryDependencies ++= Dependency.api ) @@ -57,5 +58,5 @@ lazy val example = (project in file("./example")) .dependsOn(api) .settings( - name := "teckle-example" + name := "teckel-example" ) diff --git a/docs/etl/complex.yaml b/docs/etl/complex.yaml new file mode 100644 index 0000000..db9bcba --- /dev/null +++ 
b/docs/etl/complex.yaml @@ -0,0 +1,41 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: selectTable1 + select: + from: table1 + columns: + - col1 + - col2 + - name: whereTable1 + where: + from: selectTable1 + filter: 'col1 > 10' + - name: groupByTable1 + group: + from: whereTable1 + by: + - col1 + - col2 + agg: + - sum(col1) + - max(col2) + - name: orderByTable1 + order: + from: groupByTable1 + by: + - col1 + - col2 + order: Desc + +output: + - name: orderByTable1 + format: parquet + mode: overwrite + path: 'data/parquet/example' \ No newline at end of file diff --git a/docs/etl/group-by.yaml b/docs/etl/group-by.yaml new file mode 100644 index 0000000..82631f9 --- /dev/null +++ b/docs/etl/group-by.yaml @@ -0,0 +1,24 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: groupByTable1 + group: + from: table1 + by: + - col1 + - col2 + agg: + - sum(col1) + - max(col2) + +output: + - name: groupByTable1 + format: parquet + mode: overwrite + path: 'data/parquet/example' \ No newline at end of file diff --git a/docs/etl/order-by.yaml b/docs/etl/order-by.yaml new file mode 100644 index 0000000..52241e0 --- /dev/null +++ b/docs/etl/order-by.yaml @@ -0,0 +1,22 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: orderByTable1 + order: + from: table1 + by: + - col1 + - col2 + order: Desc + +output: + - name: orderByTable1 + format: parquet + mode: overwrite + path: 'data/parquet/example' \ No newline at end of file diff --git a/docs/etl/select.yaml b/docs/etl/select.yaml new file mode 100644 index 0000000..67ced58 --- /dev/null +++ b/docs/etl/select.yaml @@ -0,0 +1,21 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: selectTable1 +
select: + from: table1 + columns: + - col1 + - col2 + +output: + - name: selectTable1 + format: parquet + mode: overwrite + path: 'data/parquet/select_table1' \ No newline at end of file diff --git a/docs/etl/where.yaml b/docs/etl/where.yaml new file mode 100644 index 0000000..935653e --- /dev/null +++ b/docs/etl/where.yaml @@ -0,0 +1,19 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: whereTable1 + where: + from: table1 + filter: 'col1 > 10' + +output: + - name: whereTable1 + format: parquet + mode: overwrite + path: 'data/parquet/example' \ No newline at end of file diff --git a/docs/formal-definition.md b/docs/formal-definition.md new file mode 100644 index 0000000..549cc7a --- /dev/null +++ b/docs/formal-definition.md @@ -0,0 +1,35 @@ +# Formal Language Definition + +Teckel uses a specific set of language constructs to define data flows. Below is the formal syntax for this DSL: + +```txt +Asset := `Asset` AssetRef Source + +Source := Input | Output | Transformation +Input := `Input` Format Options SourceRef +Output := `Output` AssetRef Format Mode Options SourceRef + +// TODO: This section needs to be double-checked and defined correctly +Transformation ::= JoinOperation | GroupOperation | WindowOperation + +// Join +JoinOperation ::= `Join` JoinType JoinRelation +JoinType ::= `Inner` | `Left` | `Right` | `Cross` | ...
+JoinRelation ::= `JoinRelation` [RelationField] +RelationField ::= `RelationField` + +// Group +GroupOperation ::= `Group` AssetRef By Agg +By ::= `By` [Column] +Agg ::= `Agg` [Column] + +Select ::= `Select` [Column] +Where ::= `Where` [Column] + +// Type Alias +AssetRef := String +Format := String +SourceRef := String +Options := `Map` String String +Context[T] := `Map` AssetRef T +``` diff --git a/docs/images/teckle-banner.png b/docs/images/teckel-banner.png similarity index 100% rename from docs/images/teckle-banner.png rename to docs/images/teckel-banner.png diff --git a/docs/images/teckle-logo-small.png b/docs/images/teckel-logo-small.png similarity index 100% rename from docs/images/teckle-logo-small.png rename to docs/images/teckel-logo-small.png diff --git a/docs/images/teckle-logo.png b/docs/images/teckel-logo.png similarity index 100% rename from docs/images/teckle-logo.png rename to docs/images/teckel-logo.png diff --git a/example/src/main/scala/com/eff3ct/teckle/api/example/EffectExample.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/EffectExample.scala similarity index 87% rename from example/src/main/scala/com/eff3ct/teckle/api/example/EffectExample.scala rename to example/src/main/scala/com/eff3ct/teckel/api/example/EffectExample.scala index 1f7dc8d..1ae6097 100644 --- a/example/src/main/scala/com/eff3ct/teckle/api/example/EffectExample.scala +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/EffectExample.scala @@ -22,13 +22,13 @@ * SOFTWARE.
*/ -package com.eff3ct.teckle.api.example +package com.eff3ct.teckel.api.example import cats.effect.IO -import com.eff3ct.teckle.api.etl.etlF -import com.eff3ct.teckle.api.spark.SparkETL -import com.eff3ct.teckle.semantic.evaluation._ -import com.eff3ct.teckle.semantic.execution._ +import com.eff3ct.teckel.api.etl.etlF +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.semantic.evaluation._ +import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.sql.SparkSession import org.slf4j.Logger diff --git a/example/src/main/scala/com/eff3ct/teckle/api/example/Example.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/Example.scala similarity index 87% rename from example/src/main/scala/com/eff3ct/teckle/api/example/Example.scala rename to example/src/main/scala/com/eff3ct/teckel/api/example/Example.scala index 412aa7a..f97759d 100644 --- a/example/src/main/scala/com/eff3ct/teckle/api/example/Example.scala +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/Example.scala @@ -22,13 +22,13 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.api.example +package com.eff3ct.teckel.api.example import cats.effect.IO -import com.eff3ct.teckle.api.etl.etl -import com.eff3ct.teckle.api.spark.SparkETL -import com.eff3ct.teckle.semantic.evaluation._ -import com.eff3ct.teckle.semantic.execution._ +import com.eff3ct.teckel.api.etl.etl +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.semantic.evaluation._ +import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.sql.SparkSession import org.slf4j.Logger diff --git a/example/src/main/scala/com/eff3ct/teckle/api/example/UnsafeExample.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/UnsafeExample.scala similarity index 87% rename from example/src/main/scala/com/eff3ct/teckle/api/example/UnsafeExample.scala rename to example/src/main/scala/com/eff3ct/teckel/api/example/UnsafeExample.scala index fd5dd1f..9cfa277 100644 --- a/example/src/main/scala/com/eff3ct/teckle/api/example/UnsafeExample.scala +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/UnsafeExample.scala @@ -22,12 +22,12 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.api.example +package com.eff3ct.teckel.api.example -import com.eff3ct.teckle.api.etl.unsafeETL -import com.eff3ct.teckle.api.spark.SparkETL -import com.eff3ct.teckle.semantic.evaluation._ -import com.eff3ct.teckle.semantic.execution._ +import com.eff3ct.teckel.api.etl.unsafeETL +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.semantic.evaluation._ +import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.sql.SparkSession import org.slf4j.Logger diff --git a/model/src/main/scala/com/eff3ct/teckle/model/Asset.scala b/model/src/main/scala/com/eff3ct/teckel/model/Asset.scala similarity index 97% rename from model/src/main/scala/com/eff3ct/teckle/model/Asset.scala rename to model/src/main/scala/com/eff3ct/teckel/model/Asset.scala index 2aead64..336cec5 100644 --- a/model/src/main/scala/com/eff3ct/teckle/model/Asset.scala +++ b/model/src/main/scala/com/eff3ct/teckel/model/Asset.scala @@ -22,6 +22,6 @@ * SOFTWARE. */ -package com.eff3ct.teckle.model +package com.eff3ct.teckel.model case class Asset(assetRef: AssetRef, source: Source) diff --git a/model/src/main/scala/com/eff3ct/teckel/model/Source.scala b/model/src/main/scala/com/eff3ct/teckel/model/Source.scala new file mode 100644 index 0000000..09d432d --- /dev/null +++ b/model/src/main/scala/com/eff3ct/teckel/model/Source.scala @@ -0,0 +1,59 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * 
copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.model + +import cats.data.NonEmptyList + +sealed trait Source + +object Source { + + case class Input(format: Format, options: Options, sourceRef: SourceRef) extends Source + + case class Output( + assetRef: AssetRef, + format: Format, + mode: Mode, + options: Options, + sourceRef: SourceRef + ) extends Source + with WithAssetRef + + trait WithAssetRef { + def assetRef: AssetRef + } + + sealed trait Transformation extends Source with WithAssetRef + + case class Select(assetRef: AssetRef, columns: NonEmptyList[Column]) extends Transformation + + case class Where(assetRef: AssetRef, condition: Condition) extends Transformation + + case class GroupBy(assetRef: AssetRef, by: NonEmptyList[Column], aggregate: NonEmptyList[Column]) + extends Transformation + + case class OrderBy(assetRef: AssetRef, by: NonEmptyList[Column], order: Option[Order]) + extends Transformation +} diff --git a/model/src/main/scala/com/eff3ct/teckle/model/package.scala b/model/src/main/scala/com/eff3ct/teckel/model/package.scala similarity index 92% rename from model/src/main/scala/com/eff3ct/teckle/model/package.scala rename to model/src/main/scala/com/eff3ct/teckel/model/package.scala index 0957bf5..4aa8cde 100644 --- a/model/src/main/scala/com/eff3ct/teckle/model/package.scala +++ b/model/src/main/scala/com/eff3ct/teckel/model/package.scala @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle +package com.eff3ct.teckel package object model { @@ -32,4 +32,8 @@ package object model { type Mode = String type Options = Map[String, String] type Context[T] = Map[AssetRef, T] + + type Column = String + type Condition = String + type Order = String } diff --git a/project/BuildPlugin.scala b/project/BuildPlugin.scala index 7c4d16d..6db2389 100644 --- a/project/BuildPlugin.scala +++ b/project/BuildPlugin.scala @@ -12,8 +12,10 @@ object BuildPlugin extends AutoPlugin { lazy val localJvmSettings: Seq[String] = Seq( - "-Xms512M", - "-Xmx2048M", + "-Xms8G", + "-Xmx8G", + "-XX:MaxMetaspaceSize=4048M", + "-XX:+CMSClassUnloadingEnabled", "-Duser.timezone=GMT", "-XX:+PrintCommandLineFlags", "-XX:+CMSClassUnloadingEnabled" @@ -31,6 +33,7 @@ object BuildPlugin extends AutoPlugin { run / javaOptions ++= localJvmSettings, run / fork := true, Test / fork := true, + Test / parallelExecution := false, headerLicense := Some(headerIOLicense), scalacOptions ++= Vector( // "-release:11", diff --git a/project/Dependency.scala b/project/Dependency.scala index 91510e5..73bdbff 100644 --- a/project/Dependency.scala +++ b/project/Dependency.scala @@ -9,7 +9,9 @@ object Dependency { lazy val model: Seq[ModuleID] = Seq( - estatico.newtype + estatico.newtype, + cats.core, + cats.laws ) lazy val semantic: Seq[ModuleID] = @@ -35,9 +37,16 @@ object Dependency { lazy val testing: Seq[ModuleID] = Seq( - test.scalaTest + test.scalaTest, + holdenkarau.sparktest ).map(d => d % "test") lazy val api: Seq[ModuleID] = testing + implicit class ProjectOps(val prj: Project) extends AnyVal { + def withKindProjector: Project = prj.settings( + addCompilerPlugin("org.typelevel" % "kind-projector" % "0.13.2" cross CrossVersion.full) + ) + } + } diff --git a/project/Library.scala b/project/Library.scala index 0423beb..f1c22d9 100644 --- a/project/Library.scala +++ b/project/Library.scala @@ -53,5 +53,8 @@ object Library { object test { lazy val scalaTest: ModuleID = "org.scalatest" %%
"scalatest" % Version.ScalaTest } + object holdenkarau { + lazy val sparktest: ModuleID = "com.holdenkarau" %% "spark-testing-base" % Version.HoldenVersion + } } diff --git a/project/Version.scala b/project/Version.scala index 4f45bf0..b5b1083 100644 --- a/project/Version.scala +++ b/project/Version.scala @@ -1,20 +1,22 @@ object Version { - lazy val Scala: String = "2.13.12" + lazy val Scala: String = "2.13.12" lazy val Scala12: String = "2.12.18" - lazy val Spark: String = "3.5.3" + lazy val Spark: String = "3.5.3" - lazy val Cats: String = "2.12.0" + lazy val Cats: String = "2.12.0" lazy val CatsEffect: String = "3.5.5" lazy val Pureconfig: String = "0.17.4" - lazy val ScalaTest: String = "3.2.9" - lazy val Postgres: String = "42.7.4" + lazy val ScalaTest: String = "3.2.9" + lazy val Postgres: String = "42.7.4" - lazy val Circe = "0.13.0" - lazy val Tofu = "0.13.0" + lazy val Circe = "0.13.0" + lazy val Tofu = "0.13.0" lazy val Estatico: String = "0.4.4" - lazy val Fs2: String = "3.9.3" + lazy val Fs2: String = "3.9.3" + + lazy val HoldenVersion: String = "3.5.3_2.0.1" lazy val Vault: String = "6.2.0" -} \ No newline at end of file +} diff --git a/semantic/src/main/scala/com/eff3ct/teckle/semantic/EvalAsset.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala similarity index 89% rename from semantic/src/main/scala/com/eff3ct/teckle/semantic/EvalAsset.scala rename to semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala index 0b63858..b7aa885 100644 --- a/semantic/src/main/scala/com/eff3ct/teckle/semantic/EvalAsset.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala @@ -22,13 +22,11 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.semantic +package com.eff3ct.teckel.semantic.core -import com.eff3ct.teckle.model._ +import com.eff3ct.teckel.model._ -trait EvalAsset[T] { - def eval(context: Context[Asset], asset: Asset): T -} +trait EvalAsset[+T] extends Semantic[Asset, Context[Asset], T] object EvalAsset { def apply[T: EvalAsset]: EvalAsset[T] = diff --git a/semantic/src/main/scala/com/eff3ct/teckle/semantic/EvalContext.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalContext.scala similarity index 93% rename from semantic/src/main/scala/com/eff3ct/teckle/semantic/EvalContext.scala rename to semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalContext.scala index bc53e8c..da78ebc 100644 --- a/semantic/src/main/scala/com/eff3ct/teckle/semantic/EvalContext.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalContext.scala @@ -22,9 +22,9 @@ * SOFTWARE. */ -package com.eff3ct.teckle.semantic +package com.eff3ct.teckel.semantic.core -import com.eff3ct.teckle.model.{Asset, Context} +import com.eff3ct.teckel.model.{Asset, Context} trait EvalContext[T] { def eval(context: Context[Asset]): T diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala new file mode 100644 index 0000000..364cf8d --- /dev/null +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala @@ -0,0 +1,38 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above 
/**
 * Gives a meaning of type `O` to a `source` of type `S`, given an `input` of type `I`.
 *
 * @tparam S the source being interpreted (contravariant)
 * @tparam I the input handed to the interpretation (contravariant)
 * @tparam O the result of the interpretation (covariant)
 */
trait Semantic[-S, -I, +O] {
  def eval(input: I, source: S): O
}

object Semantic {

  /** Summons the implicit `Semantic[S, I, O]` in scope. */
  def apply[S, I, O](implicit S: Semantic[S, I, O]): Semantic[S, I, O] =
    implicitly[Semantic[S, I, O]]

  /** Builds a semantic whose result depends on the source only; the input is ignored. */
  def pure[S, I, O](f: S => O): Semantic[S, I, O] =
    new Semantic[S, I, O] {
      override def eval(input: I, source: S): O = f(source)
    }

  /**
   * Evaluates the source `f` with a semantic that accepts any input.
   * The unit value is passed as the input.
   */
  def any[S, O](f: S)(implicit S: Semantic[S, Any, O]): O = {
    val noInput: Any = ()
    S.eval(noInput, f)
  }
}
/**
 * Evaluates an asset to a DataFrame.
 *
 * An `Input` is evaluated through `Semantic.any`. An `Output` or a `Transformation`
 * first evaluates the asset it references via `assetRef`, recursively through this
 * same evaluator, and then applies the matching `Debug` instance to that result.
 */
implicit def debug(implicit S: SparkSession): EvalAsset[DataFrame] =
  new EvalAsset[DataFrame] {
    override def eval(context: Context[Asset], asset: Asset): DataFrame =
      asset.source match {
        case in: Input =>
          Semantic.any[Input, DataFrame](in)

        case out: Output =>
          // resolve the referenced asset before handing it to the output semantic
          val upstream: DataFrame = eval(context, context(out.assetRef))
          Debug[Output].debug(upstream, out)

        case tr: Transformation =>
          val upstream: DataFrame = eval(context, context(tr.assetRef))
          Debug[Transformation].debug(upstream, tr)
      }
  }
/**
 * Executes an asset for its side effect.
 *
 * For an `Output` source the asset is first evaluated to a DataFrame with the
 * `debug` evaluator and the result is passed to `Exec[Output]`. Every other
 * source evaluates to `()`.
 */
implicit def exec(implicit S: SparkSession): EvalAsset[Unit] =
  new EvalAsset[Unit] {
    override def eval(context: Context[Asset], asset: Asset): Unit =
      asset.source match {
        case sink: Output =>
          val evaluator: EvalAsset[DataFrame] = debug
          // TODO: Check if the asset is already evaluated
          val resolved: DataFrame = evaluator.eval(context, asset)
          Exec[Output].eval(resolved, sink)
        case _ =>
          ()
      }
  }
package object semantic {

  /** A `Semantic` whose input type is `Any`, so it can be evaluated with any input value. */
  type SemanticA[S, O] = Semantic[S, Any, O]

  /** Evaluates a whole context with the `EvalContext[T]` instance in scope. */
  def eval[T: EvalContext](context: Context[Asset]): T = {
    val evaluator = EvalContext[T]
    evaluator.eval(context)
  }

  /** Evaluates a single asset of the context with the `EvalAsset[T]` instance in scope. */
  def eval[T: EvalAsset](context: Context[Asset], asset: Asset): T = {
    val evaluator = EvalAsset[T]
    evaluator.eval(context, asset)
  }

}
/** A `Semantic` that interprets a source `S` against an incoming DataFrame and yields a DataFrame. */
trait Debug[S] extends Semantic[S, DataFrame, DataFrame] {

  /** Same as `eval`, under a more descriptive name. */
  def debug(df: DataFrame, source: S): DataFrame = eval(df, source)
}

object Debug {

  /** Summons the implicit `Debug[S]` in scope. */
  def apply[S: Debug]: Debug[S] = implicitly[Debug[S]]

  /** Input: ignores any incoming value and loads the source with the Spark reader. */
  implicit def input[S <: Input](implicit S: SparkSession): SemanticA[S, DataFrame] =
    Semantic.pure { (source: S) =>
      val reader = S.read.format(source.format).options(source.options)
      reader.load(source.sourceRef)
    }

  /** Output: returns the incoming DataFrame unchanged. */
  implicit val output: Debug[Output] =
    (df, _) => df

  /** Transformation: dispatches to the instance of the concrete transformation. */
  implicit val transformation: Debug[Transformation] =
    (df, source) =>
      source match {
        case s: Select  => Debug[Select].debug(df, s)
        case s: Where   => Debug[Where].debug(df, s)
        case s: GroupBy => Debug[GroupBy].debug(df, s)
        case s: OrderBy => Debug[OrderBy].debug(df, s)
      }

  /** Select: keeps the listed columns, resolved against the incoming DataFrame. */
  implicit val select: Debug[Select] =
    (df, source) => df.select(source.columns.toList.map(df(_)): _*)

  /** Where: filters with the condition string. */
  implicit val whereS: Debug[Where] =
    (df, source) => df.where(source.condition)

  /** GroupBy: groups by the `by` columns and applies every aggregate expression. */
  implicit val groupByS: Debug[GroupBy] =
    (df, source) => {
      val grouped: RelationalGroupedDataset = df.groupBy(source.by.toList.map(df(_)): _*)
      // `agg` takes one mandatory column plus varargs; a NonEmptyList always has a
      // head, so no case distinction between one and many aggregates is needed.
      val aggregations = source.aggregate.map(expr)
      grouped.agg(aggregations.head, aggregations.tail: _*)
    }

  /** OrderBy: sorts by the `by` columns. */
  // TODO: implement the asc/desc order
  implicit val orderByS: Debug[OrderBy] =
    (df, source) => df.orderBy(source.by.toList.map(df(_)): _*)
}
/** A `Semantic` that consumes a DataFrame for a source `S` and returns `Unit`. */
trait Exec[-S] extends Semantic[S, DataFrame, Unit]

object Exec {

  /** Summons the implicit `Exec[S]` in scope. */
  def apply[S: Exec]: Exec[S] = implicitly[Exec[S]]

  /** Output: writes the DataFrame with the format, mode, options and target of the source. */
  implicit val execOutput: Exec[Output] =
    new Exec[Output] {
      override def eval(df: DataFrame, source: Output): Unit = {
        // the first field of `Output` is not needed for writing
        val Output(_, format, mode, options, ref) = source
        val writer = df.write.format(format).mode(mode).options(options)
        writer.save(ref)
      }
    }

}
package object sources {

  /** The `debug` method of the `Debug[S]` instance in scope, as a plain function. */
  def debug[S: Debug]: (DataFrame, S) => DataFrame = {
    val instance = Debug[S]
    (df: DataFrame, source: S) => instance.debug(df, source)
  }

  /** The `eval` method of the `Exec[S]` instance in scope, as a plain function. */
  def exec[S: Exec]: (DataFrame, S) => Unit = {
    val instance = Exec[S]
    (df: DataFrame, source: S) => instance.eval(df, source)
  }
}
+2024-11-20|ZTS|175.6699981689453|175.6699981689453|177.41000366210938|173.8300018310547|176.4199981689453|2187300.0 +2024-11-21|ZTS|176.7100067138672|176.7100067138672|177.66000366210938|174.5500030517578|175.6999969482422|2019500.0 +2024-11-22|ZTS|176.9600067138672|176.9600067138672|178.07000732421875|176.27999877929688|176.35000610351562|1854600.0 +2024-11-25|ZTS|178.7100067138672|178.7100067138672|178.8000030517578|176.14999389648438|177.0|4558300.0 +2024-11-26|ZTS|175.6999969482422|175.6999969482422|178.64999389648438|174.83999633789062|178.4499969482422|2539600.0 +2024-11-27|ZTS|176.74000549316406|176.74000549316406|179.27000427246094|175.0|175.27000427246094|2315800.0 +2024-11-29|ZTS|175.25|175.25|177.80999755859375|175.24000549316406|176.92999267578125|1543400.0 +2024-12-02|ZTS|176.80999755859375|176.80999755859375|176.91000366210938|173.72999572753906|175.77999877929688|2391500.0 +2024-12-03|ZTS|176.94000244140625|176.94000244140625|181.39999389648438|176.55999755859375|176.7100067138672|2679000.0 +2024-12-04|ZTS|175.32000732421875|175.32000732421875|178.5|174.5399932861328|174.60000610351562|2687000.0 +2024-12-05|ZTS|174.77000427246094|174.77000427246094|176.52999877929688|173.72000122070312|175.27000427246094|2442000.0 +2024-12-06|ZTS|176.4600067138672|176.4600067138672|177.5500030517578|174.41000366210938|174.77000427246094|2551200.0 +2024-12-09|ZTS|178.14999389648438|178.14999389648438|179.77999877929688|175.0800018310547|175.8800048828125|2387300.0 +2024-12-10|ZTS|176.7100067138672|176.7100067138672|178.50999450683594|176.1999969482422|177.8800048828125|1678200.0 +2024-12-11|ZTS|177.1699981689453|177.1699981689453|178.3800048828125|175.80999755859375|176.25999450683594|1782400.0 +2024-12-12|ZTS|178.83999633789062|178.83999633789062|179.6999969482422|176.55999755859375|176.9499969482422|1936000.0 +2024-12-13|ZTS|178.17999267578125|178.17999267578125|181.85000610351562|176.6300048828125|178.97999572753906|1650300.0 diff --git 
a/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala b/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala new file mode 100644 index 0000000..7d56bc7 --- /dev/null +++ b/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala @@ -0,0 +1,111 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/**
 * Checks each `Debug` instance against the equivalent hand-written Spark query
 * over the CSV fixture.
 */
class DebugSource
    extends AnyFlatSpecLike
    with Matchers
    with DataFrameSuiteBase
    with SparkTestUtils {

  // Resolve the fixture from the test classpath instead of a path relative to the
  // working directory, so the suite does not depend on where the JVM was started.
  private val csvPath: String =
    java.nio.file.Paths.get(getClass.getResource("/data/csv/example.csv").toURI).toString

  /** Expected data, read directly with the Spark reader. */
  object Resources {
    val input: DataFrame = spark.read
      .format("csv")
      .option("header", "true")
      .option("sep", "|")
      .load(csvPath)
  }

  /** Source descriptions under test. */
  object Sources {

    val input: Input =
      Input("csv", Map("header" -> "true", "sep" -> "|"), csvPath)

    // The target path is never written: the `Debug[Output]` instance returns its input.
    val output: Output =
      Output("table1", "parquet", "overwrite", Map(), "src/test/resources/data/parquet/example")

    val select: Select = Select("table1", NonEmptyList.of("Symbol", "Date"))

    val where: Where = Where("table1", "Date > '2024-12-12'")

    val groupBy: GroupBy = GroupBy(
      "table1",
      NonEmptyList.of("Symbol"),
      NonEmptyList.of(
        "sum(`Adj Close`) as TotalClose",
        "max(High) as Highest",
        "min(Low) as Lowest"
      )
    )

    val orderBy: OrderBy = OrderBy("table1", NonEmptyList.of("High"), Some("Asc"))

  }

  "DebugSource" should "debug an input source" in {
    Semantic.any[Input, DataFrame](Sources.input) :===: Resources.input
  }

  it should "debug an output source" in {
    Debug[Output].debug(Resources.input, Sources.output) :===: Resources.input
  }

  it should "debug a select transformation" in {
    Debug[Select].debug(Resources.input, Sources.select) :===:
      Resources.input.select("Symbol", "Date")
  }

  it should "debug a where transformation" in {
    Debug[Where].debug(Resources.input, Sources.where) :===:
      Resources.input.where("Date > '2024-12-12'")
  }

  it should "debug a groupBy transformation" in {
    Debug[GroupBy].debug(Resources.input, Sources.groupBy) :===:
      Resources.input
        .groupBy("Symbol")
        .agg(
          sum("Adj Close") as "TotalClose",
          max("High") as "Highest",
          min("Low") as "Lowest"
        )
  }

  it should "debug an orderBy transformation" in {
    Debug[OrderBy].debug(Resources.input, Sources.orderBy) :===:
      Resources.input.orderBy("High")
  }

}
/** Test helpers for suites that mix in `DataFrameSuiteBase`. */
trait SparkTestUtils { self: DataFrameSuiteBase =>

  /** Exposes the suite's Spark session as an implicit. */
  implicit lazy val sp: SparkSession = spark

  implicit class DataFrameAssert(df: DataFrame) {

    /**
     * DataFrame equality assertion, delegating to `assertDataFrameEquals`.
     *
     * The operator name ends in `:`, so Scala treats it as right-associative:
     * `a :===: b` is `b.:===:(a)`. The wrapped `df` is therefore the RIGHT operand
     * and the `expected` parameter receives the LEFT operand.
     */
    def :===:(expected: DataFrame): Unit =
      self.assertDataFrameEquals(df, expected)
  }
}
*/ -package com.eff3ct.teckle.serializer +package com.eff3ct.teckel.serializer import io.circe._ import io.circe.syntax._ diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala new file mode 100644 index 0000000..5140784 --- /dev/null +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala @@ -0,0 +1,48 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
object etl {

  /**
   * Serialized form of an ETL definition: at least one input, an optional
   * non-empty list of transformations, and at least one output.
   */
  @derive(encoder, decoder)
  case class ETL(
      input: NonEmptyList[Input],
      transformation: Option[NonEmptyList[Transformation]],
      output: NonEmptyList[Output]
  )

  object ETL {

    /** Builds an ETL without transformations. */
    def apply(input: NonEmptyList[Input], output: NonEmptyList[Output]): ETL = {
      val noTransformations: Option[NonEmptyList[Transformation]] = None
      ETL(input, noTransformations, output)
    }
  }

}
object input {

  /** Serialized form of an input: a name, a format, a path and reader options. */
  @derive(encoder)
  case class Input(
      name: String,
      format: String,
      path: String,
      options: Map[String, PrimitiveType]
  )

  /**
   * Decoder for `Input`.
   *
   * `name`, `format` and `path` are mandatory. `options` defaults to an empty map
   * when the field is absent or null. A present but malformed `options` value is
   * reported as a decoding failure instead of being silently replaced by an empty
   * map, which would drop the user's reader options without any diagnostic.
   */
  implicit val decodeInput: Decoder[Input] = (c: HCursor) => {
    for {
      name    <- c.get[String]("name")
      format  <- c.get[String]("format")
      path    <- c.get[String]("path")
      options <- c.getOrElse[Map[String, PrimitiveType]]("options")(Map.empty)
    } yield Input(name, format, path, options)
  }

}
object operations {

  /** Payload of a transformation. */
  sealed trait Operation

  /** Encodes an operation with the encoder of its concrete case class. */
  // NOTE(review): `transformation` declares implicits with these same names
  // (`encodeEvent` / `decodeEvent`); wildcard-importing both objects into one
  // scope would make the names ambiguous.
  implicit val encodeEvent: Encoder[Operation] =
    Encoder.instance { (operation: Operation) =>
      operation match {
        case select: SelectOp   => select.asJson
        case where: WhereOp     => where.asJson
        case groupBy: GroupByOp => groupBy.asJson
        case orderBy: OrderByOp => orderBy.asJson
      }
    }

  /** Tries the concrete decoders in order and keeps the first that succeeds. */
  implicit val decodeEvent: Decoder[Operation] =
    Decoder[SelectOp]
      .widen[Operation]
      .or(Decoder[WhereOp].widen[Operation])
      .or(Decoder[GroupByOp].widen[Operation])
      .or(Decoder[OrderByOp].widen[Operation])

  @derive(encoder, decoder)
  case class SelectOp(from: String, columns: NonEmptyList[String]) extends Operation

  @derive(encoder, decoder)
  case class WhereOp(from: String, filter: String) extends Operation

  @derive(encoder, decoder)
  case class GroupByOp(from: String, by: NonEmptyList[String], agg: NonEmptyList[String])
      extends Operation

  @derive(encoder, decoder)
  case class OrderByOp(from: String, by: NonEmptyList[String], order: Option[String])
      extends Operation

}
serializer/src/main/scala/com/eff3ct/teckle/serializer/model.scala rename to serializer/src/main/scala/com/eff3ct/teckel/serializer/model/output.scala index 98b6c1c..7fb2b39 100644 --- a/serializer/src/main/scala/com/eff3ct/teckle/serializer/model.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/output.scala @@ -22,23 +22,15 @@ * SOFTWARE. */ -package com.eff3ct.teckle.serializer +package com.eff3ct.teckel.serializer.model -import derevo.circe.magnolia.{decoder, encoder} +import com.eff3ct.teckel.serializer.types.PrimitiveType +import com.eff3ct.teckel.serializer.types.implicits._ +import derevo.circe.magnolia.encoder import derevo.derive import io.circe.{Decoder, HCursor} -import com.eff3ct.teckle.serializer.types.PrimitiveType -import com.eff3ct.teckle.serializer.types.implicits._ -object model { - - @derive(encoder) - case class Input( - name: String, - format: String, - path: String, - options: Map[String, PrimitiveType] - ) +object output { @derive(encoder) case class Output( @@ -49,9 +41,6 @@ object model { options: Map[String, PrimitiveType] ) - @derive(encoder, decoder) - case class ETL(input: List[Input], output: List[Output]) - /** Decoders */ implicit val decodeOutput: Decoder[Output] = (c: HCursor) => { for { @@ -67,17 +56,4 @@ object model { } yield Output(name, format, mode, path, options) } - implicit val decodeInput: Decoder[Input] = (c: HCursor) => { - for { - name <- c.downField("name").as[String] - format <- c.downField("format").as[String] - path <- c.downField("path").as[String] - options <- c - .downField("options") - .as[Map[String, PrimitiveType]] - .orElse(Right(Map.empty[String, PrimitiveType])) - - } yield Input(name, format, path, options) - } - } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala new file mode 100644 index 0000000..d6d614e --- /dev/null +++ 
b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala @@ -0,0 +1,61 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel.serializer.model + +import cats.implicits._ +import com.eff3ct.teckel.serializer.model.operations._ +import derevo.circe.magnolia.{decoder, encoder} +import derevo.derive +import io.circe.syntax._ +import io.circe.{Decoder, Encoder} + +object transformation { + sealed trait Transformation + + implicit val encodeEvent: Encoder[Transformation] = + Encoder.instance { + case s: Select => s.asJson + case w: Where => w.asJson + case g: GroupBy => g.asJson + case o: OrderBy => o.asJson + } + + implicit val decodeEvent: Decoder[Transformation] = + List[Decoder[Transformation]]( + Decoder[Select].widen, + Decoder[Where].widen, + Decoder[GroupBy].widen, + Decoder[OrderBy].widen + ).reduceLeft(_ or _) + + @derive(encoder, decoder) + case class Select(name: String, select: SelectOp) extends Transformation + @derive(encoder, decoder) + case class Where(name: String, where: WhereOp) extends Transformation + @derive(encoder, decoder) + case class GroupBy(name: String, group: GroupByOp) extends Transformation + @derive(encoder, decoder) + case class OrderBy(name: String, order: OrderByOp) extends Transformation +} diff --git a/serializer/src/main/scala/com/eff3ct/teckle/serializer/package.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala similarity index 97% rename from serializer/src/main/scala/com/eff3ct/teckle/serializer/package.scala rename to serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala index 064a0a9..51b1be1 100644 --- a/serializer/src/main/scala/com/eff3ct/teckle/serializer/package.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle +package com.eff3ct.teckel import io.circe._ diff --git a/serializer/src/main/scala/com/eff3ct/teckle/serializer/types/PrimitiveType.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/types/PrimitiveType.scala similarity index 97% rename from serializer/src/main/scala/com/eff3ct/teckle/serializer/types/PrimitiveType.scala rename to serializer/src/main/scala/com/eff3ct/teckel/serializer/types/PrimitiveType.scala index b183719..83ce754 100644 --- a/serializer/src/main/scala/com/eff3ct/teckle/serializer/types/PrimitiveType.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/types/PrimitiveType.scala @@ -22,7 +22,7 @@ * SOFTWARE. */ -package com.eff3ct.teckle.serializer.types +package com.eff3ct.teckel.serializer.types sealed trait PrimitiveType extends Serializable with Product diff --git a/serializer/src/main/scala/com/eff3ct/teckle/serializer/types/implicits.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/types/implicits.scala similarity index 96% rename from serializer/src/main/scala/com/eff3ct/teckle/serializer/types/implicits.scala rename to serializer/src/main/scala/com/eff3ct/teckel/serializer/types/implicits.scala index 77e7255..5488cc7 100644 --- a/serializer/src/main/scala/com/eff3ct/teckle/serializer/types/implicits.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/types/implicits.scala @@ -22,11 +22,11 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckle.serializer.types +package com.eff3ct.teckel.serializer.types import cats.Show import io.circe.{Decoder, Encoder, Json} -import com.eff3ct.teckle.serializer.types.PrimitiveType._ +import com.eff3ct.teckel.serializer.types.PrimitiveType._ object implicits { diff --git a/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala b/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala new file mode 100644 index 0000000..1f7ad72 --- /dev/null +++ b/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala @@ -0,0 +1,101 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel.transform + +import cats.Show +import cats.data.NonEmptyList +import com.eff3ct.teckel.model.{Asset, Context, Source} +import com.eff3ct.teckel.serializer.model.etl._ +import com.eff3ct.teckel.serializer.model.input._ +import com.eff3ct.teckel.serializer.model.output._ +import com.eff3ct.teckel.serializer.model.transformation._ +import com.eff3ct.teckel.serializer.types.PrimitiveType +import com.eff3ct.teckel.serializer.types.implicits._ + +object Rewrite { + + def rewrite(options: Map[String, PrimitiveType]): Map[String, String] = + options.map { case (k, v) => k -> Show[PrimitiveType].show(v) } + + def rewrite(item: Input): Asset = + Asset(item.name, Source.Input(item.format, rewrite(item.options), item.path)) + + def rewrite(item: Output): Asset = + Asset( + s"output_${item.name}", + Source.Output(item.name, item.format, item.mode, rewrite(item.options), item.path) + ) + + def rewriteOp(item: Select): Asset = + Asset(item.name, Source.Select(item.select.from, item.select.columns)) + + def rewriteOp(item: Where): Asset = + Asset(item.name, Source.Where(item.where.from, item.where.filter)) + + def rewriteOp(item: GroupBy): Asset = + Asset(item.name, Source.GroupBy(item.group.from, item.group.by, item.group.agg)) + + def rewriteOp(item: OrderBy): Asset = + Asset(item.name, Source.OrderBy(item.order.from, item.order.by, item.order.order)) + + def rewrite(item: Transformation): Asset = + item match { + case s: Select => rewriteOp(s) + case s: Where => rewriteOp(s) + case s: GroupBy => rewriteOp(s) + case s: OrderBy => rewriteOp(s) + } + + def icontext(item: NonEmptyList[Input]): Context[Asset] = + item + .map { i => + val asset: Asset = rewrite(i) + asset.assetRef -> asset + } + .toList + .toMap + + def ocontext(item: NonEmptyList[Output]): Context[Asset] = + item + .map { o => + val asset: Asset = rewrite(o) + asset.assetRef -> asset + } + .toList + .toMap + + def tcontext(item: Option[NonEmptyList[Transformation]]): Context[Asset] 
= + (for { + transformation <- item + context = transformation.map { t => + val asset: Asset = rewrite(t) + asset.assetRef -> asset + } + } yield context.toList.toMap).getOrElse(Map()) + + def rewrite(item: ETL): Context[Asset] = + icontext(item.input) ++ ocontext(item.output) ++ tcontext(item.transformation) + +} diff --git a/serializer/src/main/scala/com/eff3ct/teckle/transform/Rewrite.scala b/serializer/src/main/scala/com/eff3ct/teckle/transform/Rewrite.scala deleted file mode 100644 index d3b894b..0000000 --- a/serializer/src/main/scala/com/eff3ct/teckle/transform/Rewrite.scala +++ /dev/null @@ -1,58 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2024 Rafael Fernandez - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package com.eff3ct.teckle.transform - -import cats.Show -import com.eff3ct.teckle.model.Source.{Input => I, Output => O} -import com.eff3ct.teckle.model.{Asset, Context} -import com.eff3ct.teckle.serializer.model._ -import com.eff3ct.teckle.serializer.types.PrimitiveType -import com.eff3ct.teckle.serializer.types.implicits._ - -object Rewrite { - - def rewrite(options: Map[String, PrimitiveType]): Map[String, String] = - options.map { case (k, v) => k -> Show[PrimitiveType].show(v) } - - def rewrite(item: Input): Asset = - Asset(item.name, I(item.format, rewrite(item.options), item.path)) - - def rewrite(item: Output): Asset = - Asset( - s"output_${item.name}", - O(item.name, item.format, item.mode, rewrite(item.options), item.path) - ) - - def rewrite(item: ETL): Context[Asset] = - (item.input.map { i => - val asset: Asset = rewrite(i) - asset.assetRef -> asset - } ::: - item.output.map { i => - val asset: Asset = rewrite(i) - asset.assetRef -> asset - }).toMap - -} diff --git a/serializer/src/test/resources/complex.json b/serializer/src/test/resources/complex.json new file mode 100644 index 0000000..c298436 --- /dev/null +++ b/serializer/src/test/resources/complex.json @@ -0,0 +1,53 @@ +{ + "input": [ + { + "name": "table1", + "format": "csv", + "path": "data/csv/example.csv", + "options": { + "header": true, + "sep": "|" + } + } + ], + "transformation": [ + { + "name": "selectTable1", + "select": { + "from": "table1", + "columns": ["col1", "col2"] + } + }, + { + "name": "whereTable1", + "where": { + "from": "selectTable1", + "filter": "col1 > 10" + } + }, + { + "name": "groupByTable1", + "group": { + "from": "whereTable1", + "by": ["col1", "col2"], + "agg": ["sum(col1)", "max(col2)"] + } + }, + { + "name": "orderByTable1", + "order": { + "from": "groupByTable1", + "by": ["col1", "col2"], + "order": "Desc" + } + } + ], + "output": [ + { + "name": "orderByTable1", + "format": "parquet", + "mode": "overwrite", + "path": "data/parquet/example" + } + ] +} \ 
No newline at end of file diff --git a/serializer/src/test/resources/complex.yaml b/serializer/src/test/resources/complex.yaml new file mode 100644 index 0000000..e338573 --- /dev/null +++ b/serializer/src/test/resources/complex.yaml @@ -0,0 +1,43 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + + +transformation: + - name: selectTable1 + select: + from: table1 + columns: + - col1 + - col2 + - name: whereTable1 + where: + from: selectTable1 + filter: 'col1 > 10' + - name: groupByTable1 + group: + from: whereTable1 + by: + - col1 + - col2 + agg: + - sum(col1) + - max(col2) + - name: orderByTable1 + order: + from: groupByTable1 + by: + - col1 + - col2 + order: Desc + + +output: + - name: orderByTable1 + format: parquet + mode: overwrite + path: 'data/parquet/example' \ No newline at end of file diff --git a/serializer/src/test/scala/com/eff3ct/teckel/serializer/DefaultSerializerSpec.scala b/serializer/src/test/scala/com/eff3ct/teckel/serializer/DefaultSerializerSpec.scala new file mode 100644 index 0000000..8da4900 --- /dev/null +++ b/serializer/src/test/scala/com/eff3ct/teckel/serializer/DefaultSerializerSpec.scala @@ -0,0 +1,276 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.serializer + +import cats.data.NonEmptyList +import com.eff3ct.teckel.serializer.model.input._ +import com.eff3ct.teckel.serializer.model.output._ +import com.eff3ct.teckel.serializer.model.transformation._ +import com.eff3ct.teckel.serializer.model.operations._ +import com.eff3ct.teckel.serializer.model.etl._ +import com.eff3ct.teckel.serializer.types.PrimitiveType._ +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers + +class DefaultSerializerSpec extends AnyFlatSpecLike with Matchers { + + object Yaml { + + val input: String = """name: table1 + |format: csv + |path: '/path/path1/file.csv' + |options: + | header: true + | sep: '|'""".stripMargin + + val output: String = """name: table1 + |format: parquet + |mode: overwrite + |path: '/path/path1'""".stripMargin + + val select: String = + """ + |name: selectTable1 + |select: + | from: table1 + | columns: + | - col1 + | - col2 + |""".stripMargin + + val where: String = + """ + |name: whereTable1 + |where: + | from: table1 + | filter: 'col1 > 10' + |""".stripMargin + + val groupBy: String = + """ + |name: groupByTable1 + |group: + | from: table1 + | by: + | - col1 + | - col2 + | agg: + | - sum(col1) + | - max(col2) + |""".stripMargin + + val orderBy: String = + """ + |name: orderByTable1 + |order: + | from: table1 + | by: + | - col1 + | - col2 + | order: Desc + |""".stripMargin + + val etl: String = """input: + | - name: table1 + | format: csv + | path: 
'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |output: + | - name: table1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example'""".stripMargin + + val complexETL: String = """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |transformation: + | - name: selectTable1 + | select: + | from: table1 + | columns: + | - col1 + | - col2 + | - name: whereTable1 + | where: + | from: selectTable1 + | filter: 'col1 > 10' + | - name: groupByTable1 + | group: + | from: whereTable1 + | by: + | - col1 + | - col2 + | agg: + | - sum(col1) + | - max(col2) + | - name: orderByTable1 + | order: + | from: groupByTable1 + | by: + | - col1 + | - col2 + | order: Desc + | + | + |output: + | - name: orderByTable1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example' + |""".stripMargin + + } + + object Model { + + val input: Input = + Input( + "table1", + "csv", + "/path/path1/file.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + + val output: Output = + Output("table1", "parquet", "overwrite", "/path/path1", Map()) + + val select: Select = + Select( + "selectTable1", + SelectOp("table1", NonEmptyList.of("col1", "col2")) + ) + + val where: Where = + Where( + "whereTable1", + WhereOp("table1", "col1 > 10") + ) + + val groupBy: GroupBy = + GroupBy( + "groupByTable1", + GroupByOp( + "table1", + NonEmptyList.of("col1", "col2"), + NonEmptyList.of("sum(col1)", "max(col2)") + ) + ) + + val orderBy: OrderBy = + OrderBy( + "orderByTable1", + OrderByOp("table1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + + val etl: ETL = + ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + NonEmptyList.of(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) + ) + + val complexETL: ETL = + ETL( + NonEmptyList.of( + Input( + 
"table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + Some( + NonEmptyList.of( + Select("selectTable1", SelectOp("table1", NonEmptyList.of("col1", "col2"))), + Where("whereTable1", WhereOp("selectTable1", "col1 > 10")), + GroupBy( + "groupByTable1", + GroupByOp( + "whereTable1", + NonEmptyList.of("col1", "col2"), + NonEmptyList.of("sum(col1)", "max(col2)") + ) + ), + OrderBy( + "orderByTable1", + OrderByOp("groupByTable1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + ) + ), + NonEmptyList.of( + Output("orderByTable1", "parquet", "overwrite", "data/parquet/example", Map()) + ) + ) + } + + "DefaultSerializer" should "decode into an Input" in { + Serializer[Input].decode(Yaml.input) shouldBe Right(Model.input) + } + + it should "decode into an Output" in { + Serializer[Output].decode(Yaml.output) shouldBe Right(Model.output) + } + + it should "decode into a Select" in { + Serializer[Select].decode(Yaml.select) shouldBe Right(Model.select) + } + + it should "decode into a Where" in { + Serializer[Where].decode(Yaml.where) shouldBe Right(Model.where) + } + + it should "decode into a GroupBy" in { + Serializer[GroupBy].decode(Yaml.groupBy) shouldBe Right(Model.groupBy) + } + + it should "decode into a OrderBy" in { + Serializer[OrderBy].decode(Yaml.orderBy) shouldBe Right(Model.orderBy) + } + + it should "decode into a simple ETL" in { + Serializer[ETL].decode(Yaml.etl) shouldBe Right(Model.etl) + } + + it should "decode into a complex ETL" in { + Serializer[ETL].decode(Yaml.complexETL) shouldBe Right(Model.complexETL) + } +} diff --git a/serializer/src/test/scala/com/eff3ct/teckel/serializer/ExampleSpec.scala b/serializer/src/test/scala/com/eff3ct/teckel/serializer/ExampleSpec.scala new file mode 100644 index 0000000..894985c --- /dev/null +++ b/serializer/src/test/scala/com/eff3ct/teckel/serializer/ExampleSpec.scala @@ -0,0 +1,133 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + 
* + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel.serializer + +import cats.data.NonEmptyList +import com.eff3ct.teckel.serializer.model.etl._ +import com.eff3ct.teckel.serializer.model.input._ +import com.eff3ct.teckel.serializer.model.operations._ +import com.eff3ct.teckel.serializer.model.output._ +import com.eff3ct.teckel.serializer.model.transformation._ +import com.eff3ct.teckel.serializer.types.PrimitiveType._ +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers + +import scala.io.Source + +class ExampleSpec extends AnyFlatSpecLike with Matchers { + + object Model { + val simple: ETL = ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + NonEmptyList.of(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) + ) + + val complexETL: ETL = + ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + Some( + NonEmptyList.of( + Select("selectTable1", SelectOp("table1", NonEmptyList.of("col1", "col2"))), + Where("whereTable1", WhereOp("selectTable1", "col1 > 10")), + GroupBy( + "groupByTable1", + GroupByOp( + "whereTable1", + NonEmptyList.of("col1", "col2"), + NonEmptyList.of("sum(col1)", "max(col2)") + ) + ), + OrderBy( + "orderByTable1", + OrderByOp("groupByTable1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + ) + ), + NonEmptyList.of( + Output("orderByTable1", "parquet", "overwrite", "data/parquet/example", Map()) + ) + ) + } + + /** Default */ + "ExampleSpec" should "work correctly using a simple yaml with default serializer" in { + Serializer[ETL].decode( + Source.fromFile("src/test/resources/simple.yaml").mkString + ) shouldBe + Right(Model.simple) + } + + it should "work correctly using a complex yaml with default serializer" in { + Serializer[ETL].decode( + Source.fromFile("src/test/resources/complex.yaml").mkString + 
) shouldBe + Right(Model.complexETL) + } + + /** Yaml */ + it should "work correctly using a simple yaml with yaml serializer" in { + Serializer[ETL].decode( + Source.fromFile("src/test/resources/simple.yaml").mkString + ) shouldBe + Right(Model.simple) + } + + it should "work correctly using a complex yaml with yaml serializer" in { + Serializer[ETL].decode( + Source.fromFile("src/test/resources/complex.yaml").mkString + ) shouldBe + Right(Model.complexETL) + } + + /** Json */ + it should "work correctly using a simple json with json serializer" in { + Serializer[ETL].decode( + Source.fromFile("src/test/resources/simple.json").mkString + ) shouldBe + Right(Model.simple) + } + + it should "work correctly using a complex json with json serializer" in { + Serializer[ETL].decode( + Source.fromFile("src/test/resources/complex.json").mkString + ) shouldBe + Right(Model.complexETL) + } + +} diff --git a/serializer/src/test/scala/com/eff3ct/teckel/serializer/jsonspec/JsonSerializerSpec.scala b/serializer/src/test/scala/com/eff3ct/teckel/serializer/jsonspec/JsonSerializerSpec.scala new file mode 100644 index 0000000..7961301 --- /dev/null +++ b/serializer/src/test/scala/com/eff3ct/teckel/serializer/jsonspec/JsonSerializerSpec.scala @@ -0,0 +1,292 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.serializer.jsonspec + +import cats.data.NonEmptyList +import com.eff3ct.teckel.serializer.Serializer +import com.eff3ct.teckel.serializer.model.etl._ +import com.eff3ct.teckel.serializer.model.input._ +import com.eff3ct.teckel.serializer.model.operations._ +import com.eff3ct.teckel.serializer.model.output._ +import com.eff3ct.teckel.serializer.model.transformation._ +import com.eff3ct.teckel.serializer.types.PrimitiveType.{BooleanType, CharType} +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers + +class JsonSerializerSpec extends AnyFlatSpecLike with Matchers { + + import com.eff3ct.teckel.serializer.alternative.json + + object Json { + + val input: String = """{ + | "name": "table1", + | "format": "csv", + | "path": "/path/path1/file.csv", + | "options": { + | "header": true, + | "sep": "|" + | } + |}""".stripMargin + + val output: String = """{ + | "name": "table1", + | "format": "parquet", + | "mode": "overwrite", + | "path": "/path/path1" + |}""".stripMargin + + val select: String = """{ + | "name": "selectTable1", + | "select": { + | "from": "table1", + | "columns": ["col1", "col2"] + | } + |}""".stripMargin + + val where: String = """{ + | "name": "whereTable1", + | "where": { + | "from": "table1", + | "filter": "col1 > 10" + | } + |}""".stripMargin + + val groupBy: String = """{ + | "name": "groupByTable1", + | "group": { + | "from": "table1", + | "by": ["col1", "col2"], + | "agg": ["sum(col1)", 
"max(col2)"] + | } + |}""".stripMargin + + val orderBy: String = """{ + | "name": "orderByTable1", + | "order": { + | "from": "table1", + | "by": ["col1", "col2"], + | "order": "Desc" + | } + |}""".stripMargin + + val etl: String = """{ + | "input": [ + | { + | "name": "table1", + | "format": "csv", + | "path": "data/csv/example.csv", + | "options": { + | "header": true, + | "sep": "|" + | } + | } + | ], + | "output": [ + | { + | "name": "table1", + | "format": "parquet", + | "mode": "overwrite", + | "path": "data/parquet/example" + | } + | ] + |}""".stripMargin + + val complexETL: String = """{ + | "input": [ + | { + | "name": "table1", + | "format": "csv", + | "path": "data/csv/example.csv", + | "options": { + | "header": true, + | "sep": "|" + | } + | } + | ], + | "transformation": [ + | { + | "name": "selectTable1", + | "select": { + | "from": "table1", + | "columns": ["col1", "col2"] + | } + | }, + | { + | "name": "whereTable1", + | "where": { + | "from": "selectTable1", + | "filter": "col1 > 10" + | } + | }, + | { + | "name": "groupByTable1", + | "group": { + | "from": "whereTable1", + | "by": ["col1", "col2"], + | "agg": ["sum(col1)", "max(col2)"] + | } + | }, + | { + | "name": "orderByTable1", + | "order": { + | "from": "groupByTable1", + | "by": ["col1", "col2"], + | "order": "Desc" + | } + | } + | ], + | "output": [ + | { + | "name": "orderByTable1", + | "format": "parquet", + | "mode": "overwrite", + | "path": "data/parquet/example" + | } + | ] + |}""".stripMargin + + } + + object Model { + + val input: Input = + Input( + "table1", + "csv", + "/path/path1/file.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + + val output: Output = + Output("table1", "parquet", "overwrite", "/path/path1", Map()) + + val select: Select = + Select( + "selectTable1", + SelectOp("table1", NonEmptyList.of("col1", "col2")) + ) + + val where: Where = + Where( + "whereTable1", + WhereOp("table1", "col1 > 10") + ) + + val groupBy: GroupBy = + GroupBy( + 
"groupByTable1", + GroupByOp( + "table1", + NonEmptyList.of("col1", "col2"), + NonEmptyList.of("sum(col1)", "max(col2)") + ) + ) + + val orderBy: OrderBy = + OrderBy( + "orderByTable1", + OrderByOp("table1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + + val etl: ETL = + ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + NonEmptyList.of(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) + ) + + val complexETL: ETL = + ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + Some( + NonEmptyList.of( + Select("selectTable1", SelectOp("table1", NonEmptyList.of("col1", "col2"))), + Where("whereTable1", WhereOp("selectTable1", "col1 > 10")), + GroupBy( + "groupByTable1", + GroupByOp( + "whereTable1", + NonEmptyList.of("col1", "col2"), + NonEmptyList.of("sum(col1)", "max(col2)") + ) + ), + OrderBy( + "orderByTable1", + OrderByOp("groupByTable1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + ) + ), + NonEmptyList.of( + Output("orderByTable1", "parquet", "overwrite", "data/parquet/example", Map()) + ) + ) + } + + "DefaultSerializer" should "decode into an Input" in { + Serializer[Input].decode(Json.input) shouldBe Right(Model.input) + } + + it should "decode into an Output" in { + Serializer[Output].decode(Json.output) shouldBe Right(Model.output) + } + + it should "decode into a Select" in { + Serializer[Select].decode(Json.select) shouldBe Right(Model.select) + } + + it should "decode into a Where" in { + Serializer[Where].decode(Json.where) shouldBe Right(Model.where) + } + + it should "decode into a GroupBy" in { + Serializer[GroupBy].decode(Json.groupBy) shouldBe Right(Model.groupBy) + } + + it should "decode into a OrderBy" in { + Serializer[OrderBy].decode(Json.orderBy) shouldBe Right(Model.orderBy) + } + + it should "decode into a 
simple ETL" in { + Serializer[ETL].decode(Json.etl) shouldBe Right(Model.etl) + } + + it should "decode into a complex ETL" in { + Serializer[ETL].decode(Json.complexETL) shouldBe Right(Model.complexETL) + } +} diff --git a/serializer/src/test/scala/com/eff3ct/teckel/serializer/yamlspec/YamlSerializerSpec.scala b/serializer/src/test/scala/com/eff3ct/teckel/serializer/yamlspec/YamlSerializerSpec.scala new file mode 100644 index 0000000..eabc66a --- /dev/null +++ b/serializer/src/test/scala/com/eff3ct/teckel/serializer/yamlspec/YamlSerializerSpec.scala @@ -0,0 +1,279 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel.serializer.yamlspec + +import cats.data.NonEmptyList +import com.eff3ct.teckel.serializer.Serializer +import com.eff3ct.teckel.serializer.model.input._ +import com.eff3ct.teckel.serializer.model.output._ +import com.eff3ct.teckel.serializer.model.transformation._ +import com.eff3ct.teckel.serializer.model.operations._ +import com.eff3ct.teckel.serializer.model.etl._ +import com.eff3ct.teckel.serializer.types.PrimitiveType._ +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers + +class YamlSerializerSpec extends AnyFlatSpecLike with Matchers { + + import com.eff3ct.teckel.serializer.alternative.yaml + + object Yaml { + + val input: String = """name: table1 + |format: csv + |path: '/path/path1/file.csv' + |options: + | header: true + | sep: '|'""".stripMargin + + val output: String = """name: table1 + |format: parquet + |mode: overwrite + |path: '/path/path1'""".stripMargin + + val select: String = + """ + |name: selectTable1 + |select: + | from: table1 + | columns: + | - col1 + | - col2 + |""".stripMargin + + val where: String = + """ + |name: whereTable1 + |where: + | from: table1 + | filter: 'col1 > 10' + |""".stripMargin + + val groupBy: String = + """ + |name: groupByTable1 + |group: + | from: table1 + | by: + | - col1 + | - col2 + | agg: + | - sum(col1) + | - max(col2) + |""".stripMargin + + val orderBy: String = + """ + |name: orderByTable1 + |order: + | from: table1 + | by: + | - col1 + | - col2 + | order: Desc + |""".stripMargin + + val etl: String = """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |output: + | - name: table1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example'""".stripMargin + + val complexETL: String = """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |transformation: + | - name: 
selectTable1 + | select: + | from: table1 + | columns: + | - col1 + | - col2 + | - name: whereTable1 + | where: + | from: selectTable1 + | filter: 'col1 > 10' + | - name: groupByTable1 + | group: + | from: whereTable1 + | by: + | - col1 + | - col2 + | agg: + | - sum(col1) + | - max(col2) + | - name: orderByTable1 + | order: + | from: groupByTable1 + | by: + | - col1 + | - col2 + | order: Desc + | + | + |output: + | - name: orderByTable1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example' + |""".stripMargin + + } + + object Model { + + val input: Input = + Input( + "table1", + "csv", + "/path/path1/file.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + + val output: Output = + Output("table1", "parquet", "overwrite", "/path/path1", Map()) + + val select: Select = + Select( + "selectTable1", + SelectOp("table1", NonEmptyList.of("col1", "col2")) + ) + + val where: Where = + Where( + "whereTable1", + WhereOp("table1", "col1 > 10") + ) + + val groupBy: GroupBy = + GroupBy( + "groupByTable1", + GroupByOp( + "table1", + NonEmptyList.of("col1", "col2"), + NonEmptyList.of("sum(col1)", "max(col2)") + ) + ) + + val orderBy: OrderBy = + OrderBy( + "orderByTable1", + OrderByOp("table1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + + val etl: ETL = + ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + NonEmptyList.of(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) + ) + + val complexETL: ETL = + ETL( + NonEmptyList.of( + Input( + "table1", + "csv", + "data/csv/example.csv", + Map("header" -> BooleanType(true), "sep" -> CharType('|')) + ) + ), + Some( + NonEmptyList.of( + Select("selectTable1", SelectOp("table1", NonEmptyList.of("col1", "col2"))), + Where("whereTable1", WhereOp("selectTable1", "col1 > 10")), + GroupBy( + "groupByTable1", + GroupByOp( + "whereTable1", + NonEmptyList.of("col1", "col2"), + 
NonEmptyList.of("sum(col1)", "max(col2)") + ) + ), + OrderBy( + "orderByTable1", + OrderByOp("groupByTable1", NonEmptyList.of("col1", "col2"), Some("Desc")) + ) + ) + ), + NonEmptyList.of( + Output("orderByTable1", "parquet", "overwrite", "data/parquet/example", Map()) + ) + ) + } + + "YamlSerializer" should "decode into an Input" in { + Serializer[Input].decode(Yaml.input) shouldBe Right(Model.input) + } + + it should "decode into an Output" in { + Serializer[Output].decode(Yaml.output) shouldBe Right(Model.output) + } + + it should "decode into a Select" in { + Serializer[Select].decode(Yaml.select) shouldBe Right(Model.select) + } + + it should "decode into a Where" in { + Serializer[Where].decode(Yaml.where) shouldBe Right(Model.where) + } + + it should "decode into a GroupBy" in { + Serializer[GroupBy].decode(Yaml.groupBy) shouldBe Right(Model.groupBy) + } + + it should "decode into a OrderBy" in { + Serializer[OrderBy].decode(Yaml.orderBy) shouldBe Right(Model.orderBy) + } + + it should "decode into a simple ETL" in { + Serializer[ETL].decode(Yaml.etl) shouldBe Right(Model.etl) + } + + it should "decode into a complex ETL" in { + Serializer[ETL].decode(Yaml.complexETL) shouldBe Right(Model.complexETL) + } +} diff --git a/serializer/src/test/scala/com/eff3ct/teckle/serializer/DefaultSerializerSpec.scala b/serializer/src/test/scala/com/eff3ct/teckle/serializer/DefaultSerializerSpec.scala deleted file mode 100644 index 77e06ac..0000000 --- a/serializer/src/test/scala/com/eff3ct/teckle/serializer/DefaultSerializerSpec.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2024 Rafael Fernandez - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the 
Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -package com.eff3ct.teckle.serializer - -import com.eff3ct.teckle.serializer.model._ -import com.eff3ct.teckle.serializer.types.PrimitiveType._ -import org.scalatest.flatspec.AnyFlatSpecLike -import org.scalatest.matchers.should.Matchers - -class DefaultSerializerSpec extends AnyFlatSpecLike with Matchers { - - object Yaml { - - val input: String = """name: table1 - |format: csv - |path: '/path/path1/file.csv' - |options: - | header: true - | sep: '|'""".stripMargin - - val output: String = """name: table1 - |format: parquet - |mode: overwrite - |path: '/path/path1'""".stripMargin - - val etl: String = """input: - | - name: table1 - | format: csv - | path: 'data/csv/example.csv' - | options: - | header: true - | sep: '|' - | - | - |output: - | - name: table1 - | format: parquet - | mode: overwrite - | path: 'data/parquet/example'""".stripMargin - - } - - object Model { - - val input: Input = - Input( - "table1", - "csv", - "/path/path1/file.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - - val output: Output = - Output("table1", "parquet", "overwrite", "/path/path1", Map()) - - val etl: ETL = - ETL( - List( - Input( - "table1", - "csv", - "data/csv/example.csv", - Map("header" -> BooleanType(true), 
"sep" -> CharType('|')) - ) - ), - List(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) - ) - } - - "DefaultSerializer" should "decode into an Input" in { - - Serializer[Input].decode(Yaml.input) shouldBe Right(Model.input) - } - - it should "decode into an Output" in { - - Serializer[Output].decode(Yaml.output) shouldBe Right(Model.output) - } - - it should "decode into a simple ETL" in { - - Serializer[ETL].decode(Yaml.etl) shouldBe Right(Model.etl) - } -} diff --git a/serializer/src/test/scala/com/eff3ct/teckle/serializer/ExampleSpec.scala b/serializer/src/test/scala/com/eff3ct/teckle/serializer/ExampleSpec.scala deleted file mode 100644 index a81dd35..0000000 --- a/serializer/src/test/scala/com/eff3ct/teckle/serializer/ExampleSpec.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2024 Rafael Fernandez - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package com.eff3ct.teckle.serializer - -import com.eff3ct.teckle.serializer.model.{ETL, Input, Output} -import com.eff3ct.teckle.serializer.types.PrimitiveType._ -import org.scalatest.flatspec.AnyFlatSpecLike -import org.scalatest.matchers.should.Matchers - -import scala.io.Source - -class ExampleSpec extends AnyFlatSpecLike with Matchers { - - "ExampleSpec" should "work correctly using default serializer" in { - Serializer[ETL].decode( - Source.fromFile("src/test/resources/simple.yaml").mkString - ) shouldBe - Right( - ETL( - List( - Input( - "table1", - "csv", - "data/csv/example.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - ), - List(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) - ) - ) - } - - it should "work correctly using yaml serializer" in { - Serializer[ETL].decode( - Source.fromFile("src/test/resources/simple.yaml").mkString - ) shouldBe - Right( - ETL( - List( - Input( - "table1", - "csv", - "data/csv/example.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - ), - List(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) - ) - ) - } - - it should "work correctly using json serializer" in { - Serializer[ETL].decode( - Source.fromFile("src/test/resources/simple.json").mkString - ) shouldBe - Right( - ETL( - List( - Input( - "table1", - "csv", - "data/csv/example.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - ), - List(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) - ) - ) - } - -} diff --git a/serializer/src/test/scala/com/eff3ct/teckle/serializer/jsonspec/JsonSerializerSpec.scala b/serializer/src/test/scala/com/eff3ct/teckle/serializer/jsonspec/JsonSerializerSpec.scala deleted file mode 100644 index cbc362c..0000000 --- a/serializer/src/test/scala/com/eff3ct/teckle/serializer/jsonspec/JsonSerializerSpec.scala +++ /dev/null @@ -1,118 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2024 Rafael 
Fernandez - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package com.eff3ct.teckle.serializer.jsonspec - -import com.eff3ct.teckle.serializer.Serializer -import com.eff3ct.teckle.serializer.model._ -import com.eff3ct.teckle.serializer.types.PrimitiveType.{BooleanType, CharType} -import org.scalatest.flatspec.AnyFlatSpecLike -import org.scalatest.matchers.should.Matchers - -class JsonSerializerSpec extends AnyFlatSpecLike with Matchers { - - import com.eff3ct.teckle.serializer.alternative.json - - object Json { - - val input: String = """{ - | "name": "table1", - | "format": "csv", - | "path": "/path/path1/file.csv", - | "options": { - | "header": true, - | "sep": "|" - | } - |}""".stripMargin - - val output: String = """{ - | "name": "table1", - | "format": "parquet", - | "mode": "overwrite", - | "path": "/path/path1" - |}""".stripMargin - - val etl: String = """{ - | "input": [ - | { - | "name": "table1", - | "format": "csv", - | "path": "data/csv/example.csv", - | "options": { - | "header": true, - | "sep": "|" - | } - | } - | ], - | "output": [ - | { - | "name": "table1", - | "format": "parquet", - | "mode": "overwrite", - | "path": "data/parquet/example" - | } - | ] - |}""".stripMargin - - } - - object Model { - - val input: Input = - Input( - "table1", - "csv", - "/path/path1/file.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - - val output: Output = - Output("table1", "parquet", "overwrite", "/path/path1", Map()) - - val etl: ETL = - ETL( - List( - Input( - "table1", - "csv", - "data/csv/example.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - ), - List(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) - ) - } - - "JsonSerializer" should "decode into an Input" in { - Serializer[Input].decode(Json.input) shouldBe Right(Model.input) - } - - it should "decode into an Output" in { - Serializer[Output].decode(Json.output) shouldBe Right(Model.output) - } - - it should "decode into a simple ETL" in { - Serializer[ETL].decode(Json.etl) 
shouldBe Right(Model.etl) - } -} diff --git a/serializer/src/test/scala/com/eff3ct/teckle/serializer/yamlspec/YamlSerializerSpec.scala b/serializer/src/test/scala/com/eff3ct/teckle/serializer/yamlspec/YamlSerializerSpec.scala deleted file mode 100644 index 318b5b7..0000000 --- a/serializer/src/test/scala/com/eff3ct/teckle/serializer/yamlspec/YamlSerializerSpec.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2024 Rafael Fernandez - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -package com.eff3ct.teckle.serializer.yamlspec - -import com.eff3ct.teckle.serializer.Serializer -import com.eff3ct.teckle.serializer.model._ -import com.eff3ct.teckle.serializer.types.PrimitiveType._ -import org.scalatest.flatspec.AnyFlatSpecLike -import org.scalatest.matchers.should.Matchers - -class YamlSerializerSpec extends AnyFlatSpecLike with Matchers { - - import com.eff3ct.teckle.serializer.alternative.yaml - - object Yaml { - - val input: String = """name: table1 - |format: csv - |path: '/path/path1/file.csv' - |options: - | header: true - | sep: '|'""".stripMargin - - val output: String = """name: table1 - |format: parquet - |mode: overwrite - |path: '/path/path1'""".stripMargin - - val etl: String = """input: - | - name: table1 - | format: csv - | path: 'data/csv/example.csv' - | options: - | header: true - | sep: '|' - | - | - |output: - | - name: table1 - | format: parquet - | mode: overwrite - | path: 'data/parquet/example'""".stripMargin - - } - - object Model { - - val input: Input = - Input( - "table1", - "csv", - "/path/path1/file.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - - val output: Output = - Output("table1", "parquet", "overwrite", "/path/path1", Map()) - - val etl: ETL = - ETL( - List( - Input( - "table1", - "csv", - "data/csv/example.csv", - Map("header" -> BooleanType(true), "sep" -> CharType('|')) - ) - ), - List(Output("table1", "parquet", "overwrite", "data/parquet/example", Map())) - ) - } - - "YamlSerializer" should "decode into an Input" in { - Serializer[Input].decode(Yaml.input) shouldBe Right(Model.input) - } - - it should "decode into an Output" in { - Serializer[Output].decode(Yaml.output) shouldBe Right(Model.output) - } - - it should "decode into a simple ETL" in { - Serializer[ETL].decode(Yaml.etl) shouldBe Right(Model.etl) - } -} From fbe5cc81cec11db7645f2312d0c6ada85a82f7d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> 
Date: Mon, 23 Dec 2024 19:23:12 +0000 Subject: [PATCH 02/20] Fix Sonatype CI/CD (#19) --- .github/release-drafter.yml | 1 + project/BuildPlugin.scala | 8 ++++---- project/SonatypePublish.scala | 8 +++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index 79c51c5..36ec9c0 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -4,6 +4,7 @@ categories: - title: '🔥 Breaking Changes' labels: - 'breaking' + - 'break' - title: '🚀 Features' labels: - 'feature' diff --git a/project/BuildPlugin.scala b/project/BuildPlugin.scala index 6db2389..fb9cf07 100644 --- a/project/BuildPlugin.scala +++ b/project/BuildPlugin.scala @@ -31,10 +31,10 @@ object BuildPlugin extends AutoPlugin { "--add-exports=java.base/sun.nio.ch=ALL-UNNAMED" ), run / javaOptions ++= localJvmSettings, - run / fork := true, - Test / fork := true, - parallelExecution in Test := false, - headerLicense := Some(headerIOLicense), + run / fork := true, + Test / fork := true, + Test / parallelExecution := false, + headerLicense := Some(headerIOLicense), scalacOptions ++= Vector( // "-release:11", "-Ymacro-annotations", diff --git a/project/SonatypePublish.scala b/project/SonatypePublish.scala index 63c9b98..1af08c8 100644 --- a/project/SonatypePublish.scala +++ b/project/SonatypePublish.scala @@ -1,14 +1,16 @@ import sbt.Keys._ import sbt._ +import xerial.sbt.Sonatype.autoImport.{sonatypeCredentialHost, sonatypeRepository} +import xerial.sbt.Sonatype.sonatypeCentralHost import scala.collection.Seq object SonatypePublish { def projectSettings: Seq[Setting[_]] = Seq( - ThisBuild / publish / skip := true, - ThisBuild / versionScheme := Some("early-semver"), - ThisBuild / pomIncludeRepository := { _ => false } + ThisBuild / publish / skip := true, + ThisBuild / versionScheme := Some("early-semver"), + ThisBuild / sonatypeCredentialHost := sonatypeCentralHost ) } From 23064801de0831643ac46c7c0b8211b011213954 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:50:21 +0000 Subject: [PATCH 03/20] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 605c260..85939d4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Teckel +[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?event=release)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) + Teckel is a framework designed to simplify the creation of Apache Spark ETL (Extract, Transform, Load) processes using YAML configuration files. This tool aims to standardize and streamline ETL workflow creation by enabling the definition of data transformations in a declarative, user-friendly format without writing extensive code. @@ -67,4 +69,4 @@ Teckel is available under the MIT License. See the [LICENSE](./LICENSE) file for If you have any questions regarding the license, feel free to contact Rafael Fernandez. -For any issues or questions, feel free to open an issue on the GitHub repository. \ No newline at end of file +For any issues or questions, feel free to open an issue on the GitHub repository. 
From 89d9502e9bbae18b27693913de46bf5ab5d5c3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 19:51:05 +0000 Subject: [PATCH 04/20] Update release.yml --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a794207..6dc9eaa 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -13,7 +13,7 @@ jobs: - uses: actions/setup-java@v4 with: distribution: temurin - java-version: 8 + java-version: 11 cache: sbt - uses: sbt/setup-sbt@v1 - run: sbt ci-release From 66e108f8e181abca7a83f50ebb811440372e8b5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:06:25 +0000 Subject: [PATCH 05/20] Fix SonatypePublish (#20) --- project/SonatypePublish.scala | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/project/SonatypePublish.scala b/project/SonatypePublish.scala index 1af08c8..4a42d9a 100644 --- a/project/SonatypePublish.scala +++ b/project/SonatypePublish.scala @@ -1,7 +1,7 @@ import sbt.Keys._ import sbt._ -import xerial.sbt.Sonatype.autoImport.{sonatypeCredentialHost, sonatypeRepository} -import xerial.sbt.Sonatype.sonatypeCentralHost +import xerial.sbt.Sonatype.autoImport._ +import xerial.sbt.Sonatype._ import scala.collection.Seq @@ -10,7 +10,28 @@ object SonatypePublish { def projectSettings: Seq[Setting[_]] = Seq( ThisBuild / publish / skip := true, ThisBuild / versionScheme := Some("early-semver"), - ThisBuild / sonatypeCredentialHost := sonatypeCentralHost + ThisBuild / sonatypeCredentialHost := sonatypeCentralHost, + ThisBuild / organization := "com.eff3ct", + ThisBuild / organizationName := "eff3ct", + ThisBuild / homepage := Some(url("https://github.com/rafafrdz/teckel")), + ThisBuild / licenses := Seq("MIT" -> 
url("https://opensource.org/licenses/MIT")), + ThisBuild / scmInfo := Some( + ScmInfo( + browseUrl = url("https://github.com/rafafrdz/teckel"), + connection = "scm:git:git@github.com:rafafrdz/teckel.git" + ) + ), + ThisBuild / developers := List( + Developer( + id = "rafafrdz", + name = "Rafael Fernandez", + email = "hi@rafaelfernandez.dev", + url = url("https://rafaelfernandez.dev") + ) + ), + ThisBuild / sonatypeProjectHosting := Some( + GitHubHosting("rafafrdz", "teckel", "hi@rafaelfernandez.dev") + ) ) } From 364e6052c1557109c7b05b8c2cb539510d189bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:32:11 +0000 Subject: [PATCH 06/20] Publish modules (#21) --- build.sbt | 18 ++++++++++++++---- sonatype.sbt | 23 ----------------------- 2 files changed, 14 insertions(+), 27 deletions(-) delete mode 100644 sonatype.sbt diff --git a/build.sbt b/build.sbt index c0af8c0..f83ca20 100644 --- a/build.sbt +++ b/build.sbt @@ -23,7 +23,7 @@ lazy val root = lazy val model = (project in file("./model")) .settings( - name := "teckel-model", + name := "teckel-model", libraryDependencies ++= Dependency.model ) @@ -31,9 +31,19 @@ lazy val semantic = (project in file("./semantic")) .dependsOn(model) .settings( - name := "teckel-semantic", + name := "teckel-semantic", libraryDependencies ++= Dependency.semantic - ).withKindProjector + ) + .withKindProjector + +lazy val core = + project + .aggregate(model, semantic) + .dependsOn(model, semantic) + .settings( + name := "teckel-core", + publish / skip := false + ) /** Serializer */ lazy val serializer = @@ -47,7 +57,7 @@ lazy val serializer = lazy val api = (project in file("./api")) - .dependsOn(serializer, semantic) + .dependsOn(serializer, core) .settings( name := "teckel-api", publish / skip := false, diff --git a/sonatype.sbt b/sonatype.sbt deleted file mode 100644 index 73d7209..0000000 --- a/sonatype.sbt +++ /dev/null @@ -1,23 +0,0 @@ -import 
xerial.sbt.Sonatype.GitHubHosting - -organization := "com.eff3ct" -organizationName := "eff3ct" -homepage := Some(url("https://github.com/rafafrdz/teckel")) -licenses := Seq("MIT" -> url("https://opensource.org/licenses/MIT")) -scmInfo := Some( - ScmInfo( - browseUrl = url("https://github.com/rafafrdz/teckel"), - connection = "scm:git:git@github.com:rafafrdz/teckel.git" - ) -) - -developers := List( - Developer( - id = "rafafrdz", - name = "Rafael Fernandez", - email = "hi@rafaelfernandez.dev", - url = url("https://rafaelfernandez.dev") - ) -) - -sonatypeProjectHosting := Some(GitHubHosting("rafafrdz", "teckel", "hi@rafaelfernandez.dev")) From 805ebd442dcaff1cf5a3d31d274eb281713e2d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:33:45 +0000 Subject: [PATCH 07/20] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 85939d4..21476cd 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Teckel -[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?event=release)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) +[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?branch=master)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) Teckel is a framework designed to simplify the creation of Apache Spark ETL (Extract, Transform, Load) processes using YAML configuration files. 
This tool aims to standardize and streamline ETL workflow creation by From 267a5cd454b2d643655ba4af72dcc082829cbb96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 23 Dec 2024 20:49:48 +0000 Subject: [PATCH 08/20] Fix Publish (#22) --- build.sbt | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/build.sbt b/build.sbt index f83ca20..95c0231 100644 --- a/build.sbt +++ b/build.sbt @@ -23,27 +23,20 @@ lazy val root = lazy val model = (project in file("./model")) .settings( - name := "teckel-model", - libraryDependencies ++= Dependency.model + name := "teckel-model", + libraryDependencies ++= Dependency.model, + publish / skip := false ) lazy val semantic = (project in file("./semantic")) .dependsOn(model) .settings( - name := "teckel-semantic", - libraryDependencies ++= Dependency.semantic - ) - .withKindProjector - -lazy val core = - project - .aggregate(model, semantic) - .dependsOn(model, semantic) - .settings( - name := "teckel-core", + name := "teckel-semantic", + libraryDependencies ++= Dependency.semantic, publish / skip := false ) + .withKindProjector /** Serializer */ lazy val serializer = @@ -57,7 +50,7 @@ lazy val serializer = lazy val api = (project in file("./api")) - .dependsOn(serializer, core) + .dependsOn(model, semantic, serializer) .settings( name := "teckel-api", publish / skip := false, From 22bcb505c422e5585cc819e06bfedc8bf630805c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Tue, 24 Dec 2024 16:07:36 +0000 Subject: [PATCH 09/20] Update README.md (#23) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 21476cd..5013d19 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Teckel 
-[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?branch=master)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) +[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?branch=master)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) Teckel is a framework designed to simplify the creation of Apache Spark ETL (Extract, Transform, Load) processes using YAML configuration files. This tool aims to standardize and streamline ETL workflow creation by From 235a18c9386b87c5adaac69f035482a289fd71fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Wed, 25 Dec 2024 11:48:11 +0000 Subject: [PATCH 10/20] Add Teckel CLI (#24) --- .../com/eff3ct/teckel/api/core/ETL.scala | 45 ++++++++++++++++ .../eff3ct/teckel/api/{etl => core}/Run.scala | 16 +++--- .../api/{etl/package.scala => data.scala} | 13 ++--- .../scala/com/eff3ct/teckel/api/file.scala | 39 ++++++++++++++ .../scala/com/eff3ct/teckel/api/package.scala | 38 +++++++++++++ .../eff3ct/teckel/api/spark/SparkETL.scala | 18 +++---- .../com/eff3ct/teckel/api/ExampleSpec.scala | 13 +++-- build.sbt | 8 +++ .../scala/com/eff3ct/teckel/app/Main.scala | 54 +++++++++++++++++++ .../scala/com/eff3ct/teckel/io/Console.scala | 54 +++++++++++++++++++ .../scala/com/eff3ct/teckel/io/Parser.scala | 41 ++++++++++++++ .../example/{ => data}/EffectExample.scala | 12 ++--- .../api/example/{ => data}/Example.scala | 12 ++--- .../example/{ => data}/UnsafeExample.scala | 8 ++- .../teckel/api/example/data/package.scala | 43 +++++++++++++++ .../api/example/file/EffectExample.scala | 46 ++++++++++++++++ .../teckel/api/example/file/Example.scala | 46 ++++++++++++++++ .../api/example/file/UnsafeExample.scala | 41 ++++++++++++++ 18 files changed, 498 insertions(+), 49 
deletions(-) create mode 100644 api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala rename api/src/main/scala/com/eff3ct/teckel/api/{etl => core}/Run.scala (77%) rename api/src/main/scala/com/eff3ct/teckel/api/{etl/package.scala => data.scala} (79%) create mode 100644 api/src/main/scala/com/eff3ct/teckel/api/file.scala create mode 100644 api/src/main/scala/com/eff3ct/teckel/api/package.scala create mode 100644 cli/src/main/scala/com/eff3ct/teckel/app/Main.scala create mode 100644 cli/src/main/scala/com/eff3ct/teckel/io/Console.scala create mode 100644 cli/src/main/scala/com/eff3ct/teckel/io/Parser.scala rename example/src/main/scala/com/eff3ct/teckel/api/example/{ => data}/EffectExample.scala (83%) rename example/src/main/scala/com/eff3ct/teckel/api/example/{ => data}/Example.scala (83%) rename example/src/main/scala/com/eff3ct/teckel/api/example/{ => data}/UnsafeExample.scala (83%) create mode 100644 example/src/main/scala/com/eff3ct/teckel/api/example/data/package.scala create mode 100644 example/src/main/scala/com/eff3ct/teckel/api/example/file/EffectExample.scala create mode 100644 example/src/main/scala/com/eff3ct/teckel/api/example/file/Example.scala create mode 100644 example/src/main/scala/com/eff3ct/teckel/api/example/file/UnsafeExample.scala diff --git a/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala b/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala new file mode 100644 index 0000000..7cd1e9f --- /dev/null +++ b/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala @@ -0,0 +1,45 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the 
Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.api.core + +import cats.Id +import cats.effect.unsafe.implicits.global +import cats.effect.{Concurrent, IO} +import com.eff3ct.teckel.semantic.core.EvalContext +import fs2.io.file.{Files, Path} + +object ETL { + + def apply[F[_]: Run]: Run[F] = Run[F] + + def unsafe[O: EvalContext](data: String): O = Run[Id].run(data) + + def fromFile[F[_]: Files: Concurrent: Run, O: EvalContext](path: String): F[O] = + Files[F].readUtf8(Path(path)).evalMap(Run[F].run[O]).compile.lastOrError + + def usafeFromFile[O: EvalContext](path: String): O = + fromFile[IO, O](path).unsafeRunSync() + +} diff --git a/api/src/main/scala/com/eff3ct/teckel/api/etl/Run.scala b/api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala similarity index 77% rename from api/src/main/scala/com/eff3ct/teckel/api/etl/Run.scala rename to api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala index 3f86cb2..175ccb4 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/etl/Run.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala @@ -22,7 +22,7 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckel.api.etl +package com.eff3ct.teckel.api.core import cats.effect.IO import cats.effect.unsafe.implicits.global @@ -32,27 +32,25 @@ import com.eff3ct.teckel.semantic.core.EvalContext import com.eff3ct.teckel.serializer._ import com.eff3ct.teckel.serializer.model.etl._ import com.eff3ct.teckel.transform.Rewrite -import fs2.io.file.{Files, Path} trait Run[F[_]] { - def run[O: EvalContext](path: String): F[O] + def run[O: EvalContext](data: String): F[O] } object Run { def apply[F[_]: Run]: Run[F] = implicitly[Run[F]] - implicit def runF[F[_]: Compile: Files: MonadThrow]: Run[F] = new Run[F] { - override def run[O: EvalContext](path: String): F[O] = + implicit def runF[F[_]: MonadThrow]: Run[F] = new Run[F] { + override def run[O: EvalContext](data: String): F[O] = for { - data <- Files[F].readUtf8(Path(path)).compile.lastOrError - etl <- MonadThrow[F].fromEither(Serializer[ETL].decode(data)) + etl <- MonadThrow[F].fromEither(Serializer[ETL].decode(data)) context = Rewrite.rewrite(etl) } yield EvalContext[O].eval(context) } implicit val runId: Run[Id] = new Run[Id] { - override def run[O: EvalContext](path: String): Id[O] = - Run[IO].run(path).unsafeRunSync() + override def run[O: EvalContext](data: String): Id[O] = + Run[IO].run(data).unsafeRunSync() } } diff --git a/api/src/main/scala/com/eff3ct/teckel/api/etl/package.scala b/api/src/main/scala/com/eff3ct/teckel/api/data.scala similarity index 79% rename from api/src/main/scala/com/eff3ct/teckel/api/etl/package.scala rename to api/src/main/scala/com/eff3ct/teckel/api/data.scala index 8b27270..1ea044b 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/etl/package.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/data.scala @@ -24,16 +24,13 @@ package com.eff3ct.teckel.api -import cats.Id import cats.effect.IO +import com.eff3ct.teckel.api.core.{ETL, Run} import com.eff3ct.teckel.semantic.core.EvalContext -import fs2.Compiler -package object etl { +object data { + def etl[F[_]: Run, O: 
EvalContext](data: String): F[O] = ETL[F].run[O](data) + def etlIO[O: EvalContext](data: String): IO[O] = ETL[IO].run[O](data) + def unsafeETL[O: EvalContext](data: String): O = ETL.unsafe(data) - type Compile[F[_]] = Compiler[F, F] - - def etlF[F[_]: Run, O: EvalContext](path: String): F[O] = Run[F].run(path) - def etl[O: EvalContext](path: String): IO[O] = Run[IO].run(path) - def unsafeETL[O: EvalContext](path: String): O = Run[Id].run(path) } diff --git a/api/src/main/scala/com/eff3ct/teckel/api/file.scala b/api/src/main/scala/com/eff3ct/teckel/api/file.scala new file mode 100644 index 0000000..7e0a704 --- /dev/null +++ b/api/src/main/scala/com/eff3ct/teckel/api/file.scala @@ -0,0 +1,39 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel.api + +import cats.effect.{Concurrent, IO} +import com.eff3ct.teckel.api.core.{ETL, Run} +import com.eff3ct.teckel.semantic.core.EvalContext +import fs2.io.file.Files + +object file { + + def etl[F[_]: Files: Concurrent: Run, O: EvalContext](path: String): F[O] = + ETL.fromFile[F, O](path) + def etlIO[O: EvalContext](path: String): IO[O] = ETL.fromFile[IO, O](path) + def unsafeETL[O: EvalContext](path: String): O = ETL.usafeFromFile[O](path) + +} diff --git a/api/src/main/scala/com/eff3ct/teckel/api/package.scala b/api/src/main/scala/com/eff3ct/teckel/api/package.scala new file mode 100644 index 0000000..c48ff98 --- /dev/null +++ b/api/src/main/scala/com/eff3ct/teckel/api/package.scala @@ -0,0 +1,38 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel + +import cats.effect.IO +import com.eff3ct.teckel.api.core.Run +import com.eff3ct.teckel.api.{data => d} +import com.eff3ct.teckel.semantic.core.EvalContext + +package object api { + + def etl[F[_]: Run, O: EvalContext](data: String): F[O] = d.etl[F, O](data) + def etlIO[O: EvalContext](data: String): IO[O] = d.etlIO[O](data) + def unsafeETL[O: EvalContext](data: String): O = d.unsafeETL[O](data) + +} diff --git a/api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala b/api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala index 379340b..bb390c8 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/spark/SparkETL.scala @@ -47,20 +47,14 @@ trait SparkETL extends IOApp { SparkSession.builder().config(sparkConf).master(master).appName(appName).getOrCreate() } - /** - * Logger instance for the ETL - * @return logger instance - */ - private final def logger: Logger = LoggerFactory.getLogger(s"[ETL][$etlName]") - /** * Run the ETL. This method should be implemented by the ETL. 
* @param spark Spark session * @param logger logger */ - def unsafeRun(implicit spark: SparkSession, logger: Logger): Unit = { + def unsafeRun(args: List[String])(implicit spark: SparkSession, logger: Logger): Unit = { import cats.effect.unsafe.implicits.global - runIO(spark, logger).unsafeRunSync() + runIO(args)(spark, logger).unsafeRunSync() } /** @@ -69,8 +63,8 @@ trait SparkETL extends IOApp { * @param logger logger * @return IO */ - def runIO(implicit spark: SparkSession, logger: Logger): IO[Unit] = - IO(unsafeRun(spark, logger)) + def runIO(args: List[String])(implicit spark: SparkSession, logger: Logger): IO[ExitCode] = + IO(unsafeRun(args)(spark, logger)).map(_ => ExitCode.Success) /** * Main method to run the ETL @@ -78,6 +72,8 @@ trait SparkETL extends IOApp { */ final override def run(args: List[String]): IO[ExitCode] = { @transient lazy val spark: SparkSession = sparkBuilder() - runIO(spark, logger).as(ExitCode.Success) + @transient lazy val logger: Logger = LoggerFactory.getLogger(s"[ETL][$etlName]") + runIO(args)(spark, logger) } + } diff --git a/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala b/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala index 3d961b9..18a0833 100644 --- a/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala +++ b/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala @@ -24,9 +24,9 @@ package com.eff3ct.teckel.api +import cats.effect.IO import cats.effect.unsafe.implicits.global -import com.eff3ct.teckel.api.etl.{etl, unsafeETL} -import com.eff3ct.teckel.semantic.evaluation._ +import com.eff3ct.teckel.api.file._ import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession @@ -44,8 +44,13 @@ class ExampleSpec extends AnyFlatSpecLike with Matchers { implicit val spark: SparkSession = sparkBuilder() - "ExampleSpec" should "work correctly in a ETL" in { - noException should be thrownBy 
etl[Unit]("src/test/resources/etl/simple.yaml").unsafeRunSync() + "ExampleSpec" should "work correctly in a ETL F using IO" in { + noException should be thrownBy etl[IO, Unit]("src/test/resources/etl/simple.yaml") + .unsafeRunSync() + } + + it should "work correctly in a ETL IO" in { + noException should be thrownBy etlIO[Unit]("src/test/resources/etl/simple.yaml").unsafeRunSync() } it should "work correctly in an unsafe ETL" in { diff --git a/build.sbt b/build.sbt index 95c0231..b13cdd7 100644 --- a/build.sbt +++ b/build.sbt @@ -57,6 +57,14 @@ lazy val api = libraryDependencies ++= Dependency.api ) +lazy val cli = + (project in file("./cli")) + .dependsOn(api) + .settings( + name := "teckel-cli", + publish / skip := false, + ) + lazy val example = (project in file("./example")) .dependsOn(api) diff --git a/cli/src/main/scala/com/eff3ct/teckel/app/Main.scala b/cli/src/main/scala/com/eff3ct/teckel/app/Main.scala new file mode 100644 index 0000000..e52a771 --- /dev/null +++ b/cli/src/main/scala/com/eff3ct/teckel/app/Main.scala @@ -0,0 +1,54 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.app + +import cats.effect.{Async, ExitCode, IO} +import com.eff3ct.teckel.api.core.Run +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.io.Console +import com.eff3ct.teckel.semantic.core.EvalContext +import com.eff3ct.teckel.semantic.execution._ +import fs2.io.file.Files +import org.apache.spark.sql.SparkSession +import org.slf4j + +object Main extends SparkETL { + + override def runIO( + args: List[String] + )(implicit spark: SparkSession, logger: slf4j.Logger): IO[ExitCode] = + execute[IO, Unit](args).compile.drain.as(ExitCode.Success) + + def execute[F[_]: Files: Async: Run, O: EvalContext](args: List[String]): fs2.Stream[F, O] = + for { + commands <- Console.command[F](args) + result <- Console.eval[F, O](commands) + } yield result + + /** + * Name of the ETL + */ + override val etlName: String = "spark-etl-cli" +} diff --git a/cli/src/main/scala/com/eff3ct/teckel/io/Console.scala b/cli/src/main/scala/com/eff3ct/teckel/io/Console.scala new file mode 100644 index 0000000..8a278ca --- /dev/null +++ b/cli/src/main/scala/com/eff3ct/teckel/io/Console.scala @@ -0,0 +1,54 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission 
notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.io + +import cats.effect.{Async, Sync} +import com.eff3ct.teckel.api.core.Run +import com.eff3ct.teckel.semantic.core.EvalContext +import fs2.io.file.Files + +object Console { + + sealed trait Commands + case object STDIN extends Commands + case class FILE(file: String) extends Commands + + def parseCommand(args: List[String]): Commands = + args match { + case "-c" :: Nil => STDIN + case "-f" :: file :: Nil => FILE(file) + case _ => throw new IllegalArgumentException("Invalid arguments") + } + + def eval[F[_]: Files: Async: Run, O: EvalContext](commands: Commands): fs2.Stream[F, O] = + commands match { + case STDIN => Parser.parseStdin[F, O] + case FILE(file) => Parser.parseFile[F, O](file) + } + + def command[F[_]: Sync](args: List[String]): fs2.Stream[F, Commands] = + fs2.Stream.eval(Sync[F].delay(parseCommand(args))) + +} diff --git a/cli/src/main/scala/com/eff3ct/teckel/io/Parser.scala b/cli/src/main/scala/com/eff3ct/teckel/io/Parser.scala new file mode 100644 index 0000000..7432a16 --- /dev/null +++ b/cli/src/main/scala/com/eff3ct/teckel/io/Parser.scala @@ -0,0 +1,41 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without 
limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.io + +import cats.effect.Async +import com.eff3ct.teckel.api.core.Run +import com.eff3ct.teckel.semantic.core.EvalContext +import fs2.io.file.{Files, Path} +import fs2.io.stdinUtf8 + +object Parser { + + def parseFile[F[_]: Files: Run, O: EvalContext](file: String): fs2.Stream[F, O] = + Files[F].readUtf8(Path(file)).evalMap(Run[F].run[O]) + + def parseStdin[F[_]: Async: Run, O: EvalContext]: fs2.Stream[F, O] = + stdinUtf8(1024).evalMap(Run[F].run[O]) + +} diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/EffectExample.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/data/EffectExample.scala similarity index 83% rename from example/src/main/scala/com/eff3ct/teckel/api/example/EffectExample.scala rename to example/src/main/scala/com/eff3ct/teckel/api/example/data/EffectExample.scala index 1ae6097..fc7ae21 100644 --- a/example/src/main/scala/com/eff3ct/teckel/api/example/EffectExample.scala +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/data/EffectExample.scala @@ -22,12 +22,10 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckel.api.example +package com.eff3ct.teckel.api.example.data -import cats.effect.IO -import com.eff3ct.teckel.api.etl.etlF +import cats.effect.{ExitCode, IO} import com.eff3ct.teckel.api.spark.SparkETL -import com.eff3ct.teckel.semantic.evaluation._ import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.sql.SparkSession import org.slf4j.Logger @@ -39,6 +37,8 @@ object EffectExample extends SparkETL { */ override val etlName: String = "Effect Example" - override def runIO(implicit spark: SparkSession, logger: Logger): IO[Unit] = - etlF[IO, Unit]("example/src/main/resources/etl/simple.yaml") + override def runIO( + args: List[String] + )(implicit spark: SparkSession, logger: Logger): IO[ExitCode] = + com.eff3ct.teckel.api.etl[IO, Unit](yaml).as(ExitCode.Success) } diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/Example.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/data/Example.scala similarity index 83% rename from example/src/main/scala/com/eff3ct/teckel/api/example/Example.scala rename to example/src/main/scala/com/eff3ct/teckel/api/example/data/Example.scala index f97759d..9de48f9 100644 --- a/example/src/main/scala/com/eff3ct/teckel/api/example/Example.scala +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/data/Example.scala @@ -22,12 +22,10 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckel.api.example +package com.eff3ct.teckel.api.example.data -import cats.effect.IO -import com.eff3ct.teckel.api.etl.etl +import cats.effect.{ExitCode, IO} import com.eff3ct.teckel.api.spark.SparkETL -import com.eff3ct.teckel.semantic.evaluation._ import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.sql.SparkSession import org.slf4j.Logger @@ -39,6 +37,8 @@ object Example extends SparkETL { */ override val etlName: String = "Example" - override def runIO(implicit spark: SparkSession, logger: Logger): IO[Unit] = - etl[Unit]("example/src/main/resources/etl/simple.yaml") + override def runIO( + args: List[String] + )(implicit spark: SparkSession, logger: Logger): IO[ExitCode] = + com.eff3ct.teckel.api.etlIO[Unit](yaml).as(ExitCode.Success) } diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/UnsafeExample.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/data/UnsafeExample.scala similarity index 83% rename from example/src/main/scala/com/eff3ct/teckel/api/example/UnsafeExample.scala rename to example/src/main/scala/com/eff3ct/teckel/api/example/data/UnsafeExample.scala index 9cfa277..33166f6 100644 --- a/example/src/main/scala/com/eff3ct/teckel/api/example/UnsafeExample.scala +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/data/UnsafeExample.scala @@ -22,11 +22,9 @@ * SOFTWARE. 
*/ -package com.eff3ct.teckel.api.example +package com.eff3ct.teckel.api.example.data -import com.eff3ct.teckel.api.etl.unsafeETL import com.eff3ct.teckel.api.spark.SparkETL -import com.eff3ct.teckel.semantic.evaluation._ import com.eff3ct.teckel.semantic.execution._ import org.apache.spark.sql.SparkSession import org.slf4j.Logger @@ -38,6 +36,6 @@ object UnsafeExample extends SparkETL { */ override val etlName: String = "Unsafe Example" - override def unsafeRun(implicit spark: SparkSession, logger: Logger): Unit = - unsafeETL[Unit]("example/src/main/resources/etl/simple.yaml") + override def unsafeRun(args: List[String])(implicit spark: SparkSession, logger: Logger): Unit = + com.eff3ct.teckel.api.unsafeETL[Unit](yaml) } diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/data/package.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/data/package.scala new file mode 100644 index 0000000..e31c85e --- /dev/null +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/data/package.scala @@ -0,0 +1,43 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.api.example + +package object data { + val yaml: String = + """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |output: + | - name: table1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example'"""".stripMargin +} diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/file/EffectExample.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/file/EffectExample.scala new file mode 100644 index 0000000..e201c96 --- /dev/null +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/file/EffectExample.scala @@ -0,0 +1,46 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.api.example.file + +import cats.effect.{ExitCode, IO} +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.semantic.execution._ +import org.apache.spark.sql.SparkSession +import org.slf4j.Logger + +object EffectExample extends SparkETL { + + /** + * Name of the ETL + */ + override val etlName: String = "Effect Example" + + override def runIO( + args: List[String] + )(implicit spark: SparkSession, logger: Logger): IO[ExitCode] = + com.eff3ct.teckel.api.file + .etl[IO, Unit]("example/src/main/resources/etl/simple.yaml") + .as(ExitCode.Success) +} diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/file/Example.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/file/Example.scala new file mode 100644 index 0000000..f9c7d68 --- /dev/null +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/file/Example.scala @@ -0,0 +1,46 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.api.example.file + +import cats.effect.{ExitCode, IO} +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.semantic.execution._ +import org.apache.spark.sql.SparkSession +import org.slf4j.Logger + +object Example extends SparkETL { + + /** + * Name of the ETL + */ + override val etlName: String = "Example" + + override def runIO( + args: List[String] + )(implicit spark: SparkSession, logger: Logger): IO[ExitCode] = + com.eff3ct.teckel.api.file + .etlIO[Unit]("example/src/main/resources/etl/simple.yaml") + .as(ExitCode.Success) +} diff --git a/example/src/main/scala/com/eff3ct/teckel/api/example/file/UnsafeExample.scala b/example/src/main/scala/com/eff3ct/teckel/api/example/file/UnsafeExample.scala new file mode 100644 index 0000000..e8ac1cc --- /dev/null +++ b/example/src/main/scala/com/eff3ct/teckel/api/example/file/UnsafeExample.scala @@ -0,0 +1,41 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice 
shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.api.example.file + +import com.eff3ct.teckel.api.spark.SparkETL +import com.eff3ct.teckel.semantic.execution._ +import org.apache.spark.sql.SparkSession +import org.slf4j.Logger + +object UnsafeExample extends SparkETL { + + /** + * Name of the ETL + */ + override val etlName: String = "Unsafe Example" + + override def unsafeRun(args: List[String])(implicit spark: SparkSession, logger: Logger): Unit = + com.eff3ct.teckel.api.file.unsafeETL[Unit]("example/src/main/resources/etl/simple.yaml") +} From b0efd19bbce21a3e57b000a745e43b989429233c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Fri, 27 Dec 2024 18:14:26 +0100 Subject: [PATCH 11/20] patch: add cli in root project --- build.sbt | 1 + 1 file changed, 1 insertion(+) diff --git a/build.sbt b/build.sbt index b13cdd7..596588c 100644 --- a/build.sbt +++ b/build.sbt @@ -12,6 +12,7 @@ lazy val root = semantic, serializer, api, + cli, example ) From 2f059e9f3bfec5f2921368addac4a7c6b5f38a6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 30 Dec 2024 20:58:54 +0100 Subject: [PATCH 12/20] Update dependencies (#26) --- project/Dependency.scala | 5 ---- project/Library.scala | 25 ++----------------- project/Version.scala | 13 +++------- .../eff3ct/teckel/serializer/model/etl.scala | 5 
+--- .../teckel/serializer/model/input.scala | 7 +++--- .../teckel/serializer/model/operations.scala | 9 ++----- .../teckel/serializer/model/output.scala | 7 +++--- .../serializer/model/transformation.scala | 12 +++------ .../eff3ct/teckel/serializer/package.scala | 4 ++- 9 files changed, 22 insertions(+), 65 deletions(-) diff --git a/project/Dependency.scala b/project/Dependency.scala index 73bdbff..9adb912 100644 --- a/project/Dependency.scala +++ b/project/Dependency.scala @@ -3,13 +3,10 @@ import sbt._ object Dependency { - lazy val provided: String = "provided" - /** Modules */ lazy val model: Seq[ModuleID] = Seq( - estatico.newtype, cats.core, cats.laws ) @@ -27,8 +24,6 @@ object Dependency { circe.parser, circe.generic, circe.yaml, - tofu.core, - tofu.circe, catsEffect.core, catsEffect.std, fs2.io, diff --git a/project/Library.scala b/project/Library.scala index f1c22d9..42b55b7 100644 --- a/project/Library.scala +++ b/project/Library.scala @@ -19,9 +19,9 @@ object Library { } object circe { - lazy val parser = "io.circe" %% "circe-parser" % Version.Circe - lazy val yaml = "io.circe" %% "circe-yaml" % Version.Circe + lazy val parser = "io.circe" %% "circe-parser" % Version.Circe lazy val generic = "io.circe" %% "circe-generic" % Version.Circe + lazy val yaml = "io.circe" %% "circe-yaml" % Version.CirceYaml } object fs2 { @@ -29,27 +29,6 @@ object Library { lazy val io: ModuleID = "co.fs2" %% "fs2-io" % Version.Fs2 } - object tofu { - lazy val core = "tf.tofu" %% "derevo-core" % Version.Tofu - lazy val circe = "tf.tofu" %% "derevo-circe-magnolia" % Version.Tofu - } - - object estatico { - lazy val newtype: ModuleID = "io.estatico" %% "newtype" % Version.Estatico - } - - object pureconfig { - lazy val pureconfig: ModuleID = "com.github.pureconfig" %% "pureconfig" % Version.Pureconfig - } - - object database { - lazy val postgresql: ModuleID = "org.postgresql" % "postgresql" % Version.Postgres - } - - object hashicorp { - lazy val vault: ModuleID = 
"io.github.jopenlibs" % "vault-java-driver" % Version.Vault - } - object test { lazy val scalaTest: ModuleID = "org.scalatest" %% "scalatest" % Version.ScalaTest } diff --git a/project/Version.scala b/project/Version.scala index b5b1083..2cd61d5 100644 --- a/project/Version.scala +++ b/project/Version.scala @@ -7,16 +7,11 @@ object Version { lazy val Cats: String = "2.12.0" lazy val CatsEffect: String = "3.5.5" - lazy val Pureconfig: String = "0.17.4" - lazy val ScalaTest: String = "3.2.9" - lazy val Postgres: String = "42.7.4" - - lazy val Circe = "0.13.0" - lazy val Tofu = "0.13.0" - lazy val Estatico: String = "0.4.4" - lazy val Fs2: String = "3.9.3" + lazy val CirceYaml = "1.15.0" + lazy val Circe = "0.14.4" + lazy val Fs2: String = "3.9.3" + lazy val ScalaTest: String = "3.2.9" lazy val HoldenVersion: String = "3.5.3_2.0.1" - lazy val Vault: String = "6.2.0" } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala index 5140784..257e11a 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala @@ -28,12 +28,9 @@ import cats.data.NonEmptyList import com.eff3ct.teckel.serializer.model.input._ import com.eff3ct.teckel.serializer.model.output._ import com.eff3ct.teckel.serializer.model.transformation._ -import derevo.circe.magnolia.{decoder, encoder} -import derevo.derive - +import io.circe.generic.auto._ object etl { - @derive(encoder, decoder) case class ETL( input: NonEmptyList[Input], transformation: Option[NonEmptyList[Transformation]], diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/input.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/input.scala index f6b933b..593f2c6 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/input.scala +++ 
b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/input.scala @@ -26,13 +26,11 @@ package com.eff3ct.teckel.serializer.model import com.eff3ct.teckel.serializer.types.PrimitiveType import com.eff3ct.teckel.serializer.types.implicits._ -import derevo.circe.magnolia.encoder -import derevo.derive -import io.circe.{Decoder, HCursor} +import io.circe.generic.semiauto._ +import io.circe.{Decoder, Encoder, HCursor} object input { - @derive(encoder) case class Input( name: String, format: String, @@ -53,4 +51,5 @@ object input { } yield Input(name, format, path, options) } + implicit val encodeInput: Encoder[Input] = deriveEncoder[Input] } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala index 49734ba..6799a31 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala @@ -26,8 +26,7 @@ package com.eff3ct.teckel.serializer.model import cats.data.NonEmptyList import cats.implicits._ -import derevo.circe.magnolia.{decoder, encoder} -import derevo.derive +import io.circe.generic.auto._ import io.circe.syntax._ import io.circe.{Decoder, Encoder} @@ -51,14 +50,10 @@ object operations { Decoder[OrderByOp].widen ).reduceLeft(_ or _) - @derive(encoder, decoder) case class SelectOp(from: String, columns: NonEmptyList[String]) extends Operation - @derive(encoder, decoder) - case class WhereOp(from: String, filter: String) extends Operation - @derive(encoder, decoder) + case class WhereOp(from: String, filter: String) extends Operation case class GroupByOp(from: String, by: NonEmptyList[String], agg: NonEmptyList[String]) extends Operation - @derive(encoder, decoder) case class OrderByOp(from: String, by: NonEmptyList[String], order: Option[String]) extends Operation diff --git 
a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/output.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/output.scala index 7fb2b39..db12e0d 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/output.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/output.scala @@ -26,13 +26,11 @@ package com.eff3ct.teckel.serializer.model import com.eff3ct.teckel.serializer.types.PrimitiveType import com.eff3ct.teckel.serializer.types.implicits._ -import derevo.circe.magnolia.encoder -import derevo.derive -import io.circe.{Decoder, HCursor} +import io.circe.generic.semiauto._ +import io.circe.{Decoder, Encoder, HCursor} object output { - @derive(encoder) case class Output( name: String, format: String, @@ -56,4 +54,5 @@ object output { } yield Output(name, format, mode, path, options) } + implicit val encodeOutput: Encoder[Output] = deriveEncoder[Output] } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala index d6d614e..eebb1c8 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala @@ -26,8 +26,7 @@ package com.eff3ct.teckel.serializer.model import cats.implicits._ import com.eff3ct.teckel.serializer.model.operations._ -import derevo.circe.magnolia.{decoder, encoder} -import derevo.derive +import io.circe.generic.auto._ import io.circe.syntax._ import io.circe.{Decoder, Encoder} @@ -50,12 +49,9 @@ object transformation { Decoder[OrderBy].widen ).reduceLeft(_ or _) - @derive(encoder, decoder) - case class Select(name: String, select: SelectOp) extends Transformation - @derive(encoder, decoder) - case class Where(name: String, where: WhereOp) extends Transformation - @derive(encoder, decoder) + case class Select(name: String, select: SelectOp) extends 
Transformation + case class Where(name: String, where: WhereOp) extends Transformation case class GroupBy(name: String, group: GroupByOp) extends Transformation - @derive(encoder, decoder) case class OrderBy(name: String, order: OrderByOp) extends Transformation + } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala index 51b1be1..833f298 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/package.scala @@ -25,8 +25,10 @@ package com.eff3ct.teckel import io.circe._ +import io.circe.generic.AutoDerivation -package object serializer { +package object serializer extends AutoDerivation { implicit def default[T: Encoder: Decoder]: Serializer[T] = alternative.yaml + } From 01e718fd24d8654a58bdd869bf7688eff46bcb8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Tue, 31 Dec 2024 12:00:55 +0100 Subject: [PATCH 13/20] Teckel ETL Framework Assembly (#27) --- README.md | 113 ++++++++++++++++++++++++++++++++++++--- build.sbt | 17 +++--- project/Assembly.scala | 15 +++--- project/Dependency.scala | 23 +++++++- project/Library.scala | 6 +-- 5 files changed, 147 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 5013d19..1bf9ef7 100644 --- a/README.md +++ b/README.md @@ -25,39 +25,138 @@ blog: [Big Data with Zero Code](https://blog.rafaelfernandez.dev/posts/big-data- - **Apache Spark**: Ensure you have Apache Spark installed and properly configured. - **YAML files**: Create configuration files specifying your data sources and transformations. 
+#### Deployment on Docker or Kubernetes + +In case of you don't have Apache Spark installed previously, you can deploy an Apache Spark cluster using the following +docker image [ +`eff3ct/spark:latest`](https://hub.docker.com/r/eff3ct/spark) available in +the [eff3ct0/spark-docker](https://github.com/eff3ct0/spark-docker) Github repository. + ### Installation -To use Teckel, you can clone the repository and integrate it into your Spark setup: +Clone the Teckel repository and integrate it with your existing Spark setup: ```bash git clone https://github.com/rafafrdz/teckel.git cd teckel ``` -**TODO: Add instructions for building the project and integrating it into your Spark setup.** +#### Building the Teckel ETL Uber JAR + +Build the Teckel ETL CLI into an Uber JAR using the following command: + +```bash +sbt cli/assembly +``` + +The resulting JAR, `teckel-etl_2.13.jar`, will be located in the `cli/target/scala-2.13/` directory. + +> [!IMPORTANT] **Teckel CLI as dependency / Teckel ETL as framework.** +> +> The Teckel CLI is a standalone application that can be used as a dependency in your project. Notice that the uber jar +> name is `teckel-etl` and not `teckel-cli` or `teckel-cli-assembly`. This is because +> we want to distinguish between the Teckel CLI dependency and the ETL framework. + +### Usage in Apache Spark -### Usage +Once the `teckel-etl_2.13.jar`is ready, use it to execute ETL processes on Apache Spark with the following arguments: -Once you have installed Teckel, you can use it to run ETL processes. +- `-f` or `--file`: The path to the ETL file. +- `-c` or `--console`: Run the ETL in the console. 
-**TODO: Add instructions for running ETL processes using Teckel.** +#### Example: Running ETL in Apache Spark using STDIN -## ETL Yaml Example Specification +To run the ETL in the **console**, you can use the following command: + +```bash +cat << EOF | /opt/spark/bin/spark-submit --class com.eff3ct.teckel.app.Main teckel-etl_2.13.jar -c +input: + - name: table1 + format: csv + path: '/path/to/data/file.csv' + options: + header: true + sep: '|' + + +output: + - name: table1 + format: parquet + mode: overwrite + path: '/path/to/output/' +EOF +``` + +#### Example: Running ETL in Apache Spark using a file + +To run the ETL from a **file**, you can use the following command: + +```bash +/opt/spark/bin/spark-submit --class com.eff3ct.teckel.app.Main teckel-etl_2.13.jar -f /path/to/etl/file.yaml +``` + +## Integration with Apache Spark + +### As Dependency + +Teckel can be integrated with Apache Spark easily just adding either the Teckel CLI or Teckel Api as a +dependency in your project. + +#### SBT + +In your `build.sbt` file, add the following dependency: + +```scala +libraryDependencies += "com.eff3ct" %% "teckel-cli" % "" +// or +libraryDependencies += "com.eff3ct" %% "teckel-api" % "" +``` + +### As Framework + +Teckel can also be used as a framework in your Apache Spark project by including the Teckel ETL Uber JAR in your Apache +Spark ecosystem. 
+ +Build the Teckel ETL CLI into an Uber JAR using the following command: + +```bash +sbt cli/assembly +``` + +#### Local Spark Environment Setup + +Copy the Teckel ETL Uber JAR to the `/opt/spark/jars/` directory in your Apache Spark ecosystem: + +```bash +cp cli/target/scala-2.13/teckel-etl_2.13.jar /opt/spark/jars/ +``` + +#### Docker Usage + +Mount the Teckel ETL Uber JAR in your Docker container: + +```bash +docker run -v ./cli/target/scala-2.13/teckel-etl_2.13.jar:/app/teckel-etl_2.13.jar -it eff3ct/spark:latest /bin/bash + +``` + +## ETL Yaml Example Here's an example of a fully defined ETL configuration using a YAML file: ### SQL ETL + - Simple Example: [here](./docs/etl/simple.yaml) - Complex Example: [here](./docs/etl/complex.yaml) - Other Example: [here](./docs/etl/example.yaml) ### SQL Transformations + - `Select` Example: [here](./docs/etl/select.yaml) - `Where` Example: [here](./docs/etl/where.yaml) - `Group By` Example: [here](./docs/etl/group-by.yaml) - `Order By` Example: [here](./docs/etl/order-by.yaml) - ## Development and Contribution Contributions to Teckel are welcome. 
If you'd like to contribute, please fork the repository and create a pull request diff --git a/build.sbt b/build.sbt index 596588c..d5a4572 100644 --- a/build.sbt +++ b/build.sbt @@ -12,8 +12,7 @@ lazy val root = semantic, serializer, api, - cli, - example + cli ) /** @@ -27,7 +26,7 @@ lazy val model = name := "teckel-model", libraryDependencies ++= Dependency.model, publish / skip := false - ) + ).withNoAssembly lazy val semantic = (project in file("./semantic")) @@ -36,7 +35,7 @@ lazy val semantic = name := "teckel-semantic", libraryDependencies ++= Dependency.semantic, publish / skip := false - ) + ).withNoAssembly .withKindProjector /** Serializer */ @@ -47,7 +46,7 @@ lazy val serializer = name := "teckel-serializer", publish / skip := false, libraryDependencies ++= Dependency.serializer - ) + ).withNoAssembly lazy val api = (project in file("./api")) @@ -56,7 +55,7 @@ lazy val api = name := "teckel-api", publish / skip := false, libraryDependencies ++= Dependency.api - ) + ).withNoAssembly lazy val cli = (project in file("./cli")) @@ -64,11 +63,13 @@ lazy val cli = .settings( name := "teckel-cli", publish / skip := false, - ) + libraryDependencies ++= Dependency.sparkD + ).withAssembly("teckel-etl") lazy val example = (project in file("./example")) .dependsOn(api) .settings( - name := "teckel-example" + name := "teckel-example", + libraryDependencies ++= Dependency.sparkD ) diff --git a/project/Assembly.scala b/project/Assembly.scala index 419a6c4..5103481 100644 --- a/project/Assembly.scala +++ b/project/Assembly.scala @@ -1,15 +1,13 @@ -import sbt.Keys.{artifact, name, scalaBinaryVersion, version} +import sbt.Keys.{artifact, name, scalaBinaryVersion} import sbt.librarymanagement.Artifact -import sbt.{addArtifact, Compile, Setting} +import sbt.{Compile, Setting, addArtifact} import sbtassembly.AssemblyKeys.{assembly, assemblyJarName, assemblyMergeStrategy} import sbtassembly.AssemblyPlugin.autoImport.MergeStrategy import sbtassembly.PathList object Assembly { 
- lazy val classifier: String = "with-dependencies" - - def projectSettings: Seq[Setting[_]] = + def projectSettings(assemblyName: Option[String] = None): Seq[Setting[_]] = Seq( assembly / assemblyMergeStrategy := { case "META-INF/services/org.apache.spark.sql.sources.DataSourceRegister" => @@ -19,14 +17,17 @@ object Assembly { case x => MergeStrategy.first }, // JAR file settings - assembly / assemblyJarName := s"${name.value}_${scalaBinaryVersion.value}_${version.value}-$classifier.jar" + assembly / assemblyJarName := { + val aName: String = assemblyName.getOrElse(name.value) + s"${aName}_${scalaBinaryVersion.value}.jar" + } ) def publishAssemblyJar: Seq[Setting[_]] = Seq( Compile / assembly / artifact := { val art: Artifact = (Compile / assembly / artifact).value - art.withClassifier(Some(classifier)) + art } ) ++ addArtifact(Compile / assembly / artifact, assembly) diff --git a/project/Dependency.scala b/project/Dependency.scala index 9adb912..02db466 100644 --- a/project/Dependency.scala +++ b/project/Dependency.scala @@ -1,5 +1,6 @@ import Library._ import sbt._ +import sbtassembly.AssemblyPlugin object Dependency { @@ -20,7 +21,7 @@ object Dependency { ) ++ testing lazy val serializer: Seq[ModuleID] = - Seq( + sparkD ++ Seq( circe.parser, circe.generic, circe.yaml, @@ -36,12 +37,30 @@ object Dependency { holdenkarau.sparktest ).map(d => d % "test") - lazy val api: Seq[ModuleID] = testing + lazy val api: Seq[ModuleID] = sparkD ++ testing + + lazy val sparkD: Seq[ModuleID] = Seq( + spark.core, + spark.sql + ) implicit class ProjectOps(val prj: Project) extends AnyVal { def withKindProjector: Project = prj.settings( addCompilerPlugin("org.typelevel" % "kind-projector" % "0.13.2" cross CrossVersion.full) ) + + def withNoAssembly: Project = prj.disablePlugins(AssemblyPlugin) + + def withAssembly: Project = + prj + .enablePlugins(AssemblyPlugin) + .settings(Assembly.projectSettings(None)) + + def withAssembly(name: String): Project = + prj + 
.enablePlugins(AssemblyPlugin) + .settings(Assembly.projectSettings(Some(name))) + } } diff --git a/project/Library.scala b/project/Library.scala index 42b55b7..f1f6d5d 100644 --- a/project/Library.scala +++ b/project/Library.scala @@ -3,9 +3,9 @@ import sbt._ object Library { object spark { - lazy val core: ModuleID = "org.apache.spark" %% "spark-core" % Version.Spark - lazy val sql: ModuleID = "org.apache.spark" %% "spark-sql" % Version.Spark - lazy val hadoopCloud: ModuleID = "org.apache.spark" %% "spark-hadoop-cloud" % Version.Spark + lazy val core: ModuleID = "org.apache.spark" %% "spark-core" % Version.Spark % Provided + lazy val sql: ModuleID = "org.apache.spark" %% "spark-sql" % Version.Spark % Provided + lazy val hadoopCloud: ModuleID = "org.apache.spark" %% "spark-hadoop-cloud" % Version.Spark % Provided } object cats { From ecf6a230ce771e3ab2ca2f337e467e4c886bf84b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Tue, 31 Dec 2024 11:01:46 +0000 Subject: [PATCH 14/20] Edit README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1bf9ef7..c8d3d7f 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,9 @@ sbt cli/assembly The resulting JAR, `teckel-etl_2.13.jar`, will be located in the `cli/target/scala-2.13/` directory. -> [!IMPORTANT] **Teckel CLI as dependency / Teckel ETL as framework.** +> [!IMPORTANT] +> +> **Teckel CLI as dependency / Teckel ETL as framework.** > > The Teckel CLI is a standalone application that can be used as a dependency in your project. Notice that the uber jar > name is `teckel-etl` and not `teckel-cli` or `teckel-cli-assembly`. 
This is because From 46831f5dc9d990349d16c026d6bb3dd554051fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Thu, 2 Jan 2025 20:15:53 +0100 Subject: [PATCH 15/20] Add/Edit API Methods (#28) --- README.md | 66 ++++++++++++++++++- .../com/eff3ct/teckel/api/core/ETL.scala | 5 +- .../com/eff3ct/teckel/api/core/Run.scala | 12 +++- .../scala/com/eff3ct/teckel/api/data.scala | 12 ++-- .../scala/com/eff3ct/teckel/api/package.scala | 4 ++ .../eff3ct/teckel/serializer/model/etl.scala | 3 +- 6 files changed, 94 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c8d3d7f..2568ea7 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ blog: [Big Data with Zero Code](https://blog.rafaelfernandez.dev/posts/big-data- In case of you don't have Apache Spark installed previously, you can deploy an Apache Spark cluster using the following docker image [ `eff3ct/spark:latest`](https://hub.docker.com/r/eff3ct/spark) available in -the [eff3ct0/spark-docker](https://github.com/eff3ct0/spark-docker) Github repository. +the [eff3ct0/spark-docker](https://github.com/eff3ct0/spark-docker) GitHub repository. ### Installation @@ -114,6 +114,70 @@ libraryDependencies += "com.eff3ct" %% "teckel-cli" % "" libraryDependencies += "com.eff3ct" %% "teckel-api" % "" ``` +#### Example: Running ETL in a Standalone Application + +```scala +import cats.effect.{ExitCode, IO, IOApp} +import com.eff3ct.teckel.api._ +import com.eff3ct.teckel.semantic.execution._ +import org.apache.spark.sql.SparkSession + +object Example extends IOApp { + + /** + * Name of the ETL + */ + + implicit val spark: SparkSession = ??? 
+ + val data: String = + """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |output: + | - name: table1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example'"""".stripMargin + + + override def run(args: List[String]): IO[ExitCode] = + etl[IO, Unit](data).as(ExitCode.Success) +} +``` + +You can use either the `etl`, `etlIO` or `unsafeETL` methods to run the ETL from the api package. + +```scala +def etl[F[_] : Run, O: EvalContext](data: String): F[O] +def etl[F[_] : Run, O: EvalContext](data: ETL): F[O] + +def etlIO[O: EvalContext](data: String): IO[O] +def etlIO[O: EvalContext](data: ETL): IO[O] + +def unsafeETL[O: EvalContext](data: String): O +def unsafeETL[O: EvalContext](data: ETL): O +``` + +### The set of Evaluation Contexts + +The Teckel API offers the `EvalContext[T]`, a versatile construct designed to evaluate ETL contexts and provide results +of type `T`. This enables flexible evaluation strategies for ETL processes, with two primary derivations: + +- `EvalContext[Unit]`: This context executes the ETL process, performing all specified operations, and ultimately + produces the spected output files. It is ideal for scenarios where the primary objective is the completion of data + transformations and load operations. +- `EvalContext[Context[DataFrame]]`: This context evaluates the ETL instructions with a focus on debugging and analysis. + Instead of executing transformations outright, it returns a `Context[DataFrame]`, which maps ETL component names to + their corresponding DataFrames. This allows developers to inspect intermediate DataFrames, facilitating a deeper + understanding of the data flow and transformation logic within the ETL process. 
+ ### As Framework Teckel can also be used as a framework in your Apache Spark project by including the Teckel ETL Uber JAR in your Apache diff --git a/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala b/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala index 7cd1e9f..cec8f08 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/core/ETL.scala @@ -28,14 +28,17 @@ import cats.Id import cats.effect.unsafe.implicits.global import cats.effect.{Concurrent, IO} import com.eff3ct.teckel.semantic.core.EvalContext +import com.eff3ct.teckel.serializer.model.etl.{ETL => ETLD} import fs2.io.file.{Files, Path} -object ETL { +private[api] object ETL { def apply[F[_]: Run]: Run[F] = Run[F] def unsafe[O: EvalContext](data: String): O = Run[Id].run(data) + def unsafe[O: EvalContext](data: ETLD): O = Run[Id].run(data) + def fromFile[F[_]: Files: Concurrent: Run, O: EvalContext](path: String): F[O] = Files[F].readUtf8(Path(path)).evalMap(Run[F].run[O]).compile.lastOrError diff --git a/api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala b/api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala index 175ccb4..61203a6 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/core/Run.scala @@ -27,7 +27,7 @@ package com.eff3ct.teckel.api.core import cats.effect.IO import cats.effect.unsafe.implicits.global import cats.implicits._ -import cats.{Id, MonadThrow} +import cats.{Id, Monad, MonadThrow} import com.eff3ct.teckel.semantic.core.EvalContext import com.eff3ct.teckel.serializer._ import com.eff3ct.teckel.serializer.model.etl._ @@ -35,6 +35,7 @@ import com.eff3ct.teckel.transform.Rewrite trait Run[F[_]] { def run[O: EvalContext](data: String): F[O] + def run[O: EvalContext](data: ETL): F[O] } object Run { @@ -47,10 +48,19 @@ object Run { etl <- MonadThrow[F].fromEither(Serializer[ETL].decode(data)) context = Rewrite.rewrite(etl) } yield 
EvalContext[O].eval(context) + + override def run[O: EvalContext](data: ETL): F[O] = + for { + etl <- Monad[F].pure(data) + context = Rewrite.rewrite(etl) + } yield EvalContext[O].eval(context) } implicit val runId: Run[Id] = new Run[Id] { override def run[O: EvalContext](data: String): Id[O] = Run[IO].run(data).unsafeRunSync() + + override def run[O: EvalContext](data: ETL): Id[O] = + Run[IO].run(data).unsafeRunSync() } } diff --git a/api/src/main/scala/com/eff3ct/teckel/api/data.scala b/api/src/main/scala/com/eff3ct/teckel/api/data.scala index 1ea044b..8f99da9 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/data.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/data.scala @@ -25,12 +25,16 @@ package com.eff3ct.teckel.api import cats.effect.IO -import com.eff3ct.teckel.api.core.{ETL, Run} +import com.eff3ct.teckel.api.core.{ETL => ETLC, Run} import com.eff3ct.teckel.semantic.core.EvalContext +import com.eff3ct.teckel.serializer.model.etl.ETL object data { - def etl[F[_]: Run, O: EvalContext](data: String): F[O] = ETL[F].run[O](data) - def etlIO[O: EvalContext](data: String): IO[O] = ETL[IO].run[O](data) - def unsafeETL[O: EvalContext](data: String): O = ETL.unsafe(data) + def etl[F[_]: Run, O: EvalContext](data: String): F[O] = ETLC[F].run[O](data) + def etl[F[_]: Run, O: EvalContext](data: ETL): F[O] = ETLC[F].run[O](data) + def etlIO[O: EvalContext](data: String): IO[O] = ETLC[IO].run[O](data) + def etlIO[O: EvalContext](data: ETL): IO[O] = ETLC[IO].run[O](data) + def unsafeETL[O: EvalContext](data: String): O = ETLC.unsafe(data) + def unsafeETL[O: EvalContext](data: ETL): O = ETLC.unsafe(data) } diff --git a/api/src/main/scala/com/eff3ct/teckel/api/package.scala b/api/src/main/scala/com/eff3ct/teckel/api/package.scala index c48ff98..cc7c15a 100644 --- a/api/src/main/scala/com/eff3ct/teckel/api/package.scala +++ b/api/src/main/scala/com/eff3ct/teckel/api/package.scala @@ -28,11 +28,15 @@ import cats.effect.IO import com.eff3ct.teckel.api.core.Run import 
com.eff3ct.teckel.api.{data => d} import com.eff3ct.teckel.semantic.core.EvalContext +import com.eff3ct.teckel.serializer.model.etl.ETL package object api { def etl[F[_]: Run, O: EvalContext](data: String): F[O] = d.etl[F, O](data) + def etl[F[_]: Run, O: EvalContext](data: ETL): F[O] = d.etl[F, O](data) def etlIO[O: EvalContext](data: String): IO[O] = d.etlIO[O](data) + def etlIO[O: EvalContext](data: ETL): IO[O] = d.etlIO[O](data) def unsafeETL[O: EvalContext](data: String): O = d.unsafeETL[O](data) + def unsafeETL[O: EvalContext](data: ETL): O = d.unsafeETL[O](data) } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala index 257e11a..817df42 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/etl.scala @@ -29,7 +29,8 @@ import com.eff3ct.teckel.serializer.model.input._ import com.eff3ct.teckel.serializer.model.output._ import com.eff3ct.teckel.serializer.model.transformation._ import io.circe.generic.auto._ -object etl { + +private[teckel] object etl { case class ETL( input: NonEmptyList[Input], From e460059550b02530f42f9374f29f5ef2a7d388cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:06:18 +0000 Subject: [PATCH 16/20] Add video demo in Readme --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 2568ea7..eab9523 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,12 @@ Once the `teckel-etl_2.13.jar`is ready, use it to execute ETL processes on Apach #### Example: Running ETL in Apache Spark using STDIN +
Demo - Teckel and Apache Spark by STDIN + +[![Teckel and Apache Spark by Yaml File](https://res.cloudinary.com/marcomontalbano/image/upload/v1735905159/video_to_markdown/images/youtube--eJwJIbNAtto-c05b58ac6eb4c4700831b2b3070cd403.jpg)](https://www.youtube.com/watch?v=oxNjnxIdbig "Teckel and Apache Spark by STDIN") + +
+ To run the ETL in the **console**, you can use the following command: ```bash @@ -91,6 +97,12 @@ EOF #### Example: Running ETL in Apache Spark using a file +
Demo - Teckel and Apache Spark by Yaml File + +[![Teckel and Apache Spark by Yaml File](https://res.cloudinary.com/marcomontalbano/image/upload/v1735905159/video_to_markdown/images/youtube--eJwJIbNAtto-c05b58ac6eb4c4700831b2b3070cd403.jpg)](https://www.youtube.com/watch?v=eJwJIbNAtto "Teckel and Apache Spark by Yaml File") + +
+ To run the ETL from a **file**, you can use the following command: ```bash From 2c95005a7129212a87c44071e27f0622d318b71f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:43:42 +0100 Subject: [PATCH 17/20] Add Documentation (#29) --- README.md | 160 +++++-------------------- docs/etl-grammar-ideas.md | 161 ------------------------- docs/formal-definition.md | 35 ------ docs/integration-apache-spark.md | 197 +++++++++++++++++++++++++++++++ docs/publish.md | 64 ---------- 5 files changed, 227 insertions(+), 390 deletions(-) delete mode 100644 docs/etl-grammar-ideas.md delete mode 100644 docs/formal-definition.md create mode 100644 docs/integration-apache-spark.md delete mode 100644 docs/publish.md diff --git a/README.md b/README.md index eab9523..da69730 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,23 @@ blog: [Big Data with Zero Code](https://blog.rafaelfernandez.dev/posts/big-data- - **Flexible Transformations:** Perform joins, aggregations, and selections with clear syntax. - **Spark Compatibility:** Leverage the power of Apache Spark for large-scale data processing. +## ETL Yaml Example + +Here's an example of a fully defined ETL configuration using a YAML file: + +### SQL ETL + +- Simple Example: [here](./docs/etl/simple.yaml) +- Complex Example: [here](./docs/etl/complex.yaml) +- Other Example: [here](./docs/etl/example.yaml) + +### SQL Transformations + +- `Select` Example: [here](./docs/etl/select.yaml) +- `Where` Example: [here](./docs/etl/where.yaml) +- `Group By` Example: [here](./docs/etl/group-by.yaml) +- `Order By` Example: [here](./docs/etl/order-by.yaml) + ## Getting Started ### Prerequisites @@ -51,15 +68,7 @@ sbt cli/assembly The resulting JAR, `teckel-etl_2.13.jar`, will be located in the `cli/target/scala-2.13/` directory. 
-> [!IMPORTANT] -> -> **Teckel CLI as dependency / Teckel ETL as framework.** -> -> The Teckel CLI is a standalone application that can be used as a dependency in your project. Notice that the uber jar -> name is `teckel-etl` and not `teckel-cli` or `teckel-cli-assembly`. This is because -> we want to distinguish between the Teckel CLI dependency and the ETL framework. - -### Usage in Apache Spark +### Usage in Apache Spark Ecosystem with the CLI Once the `teckel-etl_2.13.jar`is ready, use it to execute ETL processes on Apache Spark with the following arguments: @@ -109,131 +118,22 @@ To run the ETL from a **file**, you can use the following command: /opt/spark/bin/spark-submit --class com.eff3ct.teckel.app.Main teckel-etl_2.13.jar -f /path/to/etl/file.yaml ``` -## Integration with Apache Spark +> [!IMPORTANT] +> +> **Teckel CLI as dependency / Teckel ETL as framework.** +> +> The Teckel CLI is a standalone application that can be used as a dependency in your project. Notice that the uber jar +> name is `teckel-etl` and not `teckel-cli` or `teckel-cli-assembly`. This is because +> we want to distinguish between the Teckel CLI dependency and the ETL framework. +> +> Check the [Integration with Apache Spark](./docs/integration-apache-spark.md) documentation for more information. -### As Dependency +## Integration with Apache Spark Teckel can be integrated with Apache Spark easily just adding either the Teckel CLI or Teckel Api as a -dependency in your project. 
- -#### SBT - -In your `build.sbt` file, add the following dependency: - -```scala -libraryDependencies += "com.eff3ct" %% "teckel-cli" % "" -// or -libraryDependencies += "com.eff3ct" %% "teckel-api" % "" -``` - -#### Example: Running ETL in a Standalone Application - -```scala -import cats.effect.{ExitCode, IO, IOApp} -import com.eff3ct.teckel.api._ -import com.eff3ct.teckel.semantic.execution._ -import org.apache.spark.sql.SparkSession - -object Example extends IOApp { - - /** - * Name of the ETL - */ - - implicit val spark: SparkSession = ??? - - val data: String = - """input: - | - name: table1 - | format: csv - | path: 'data/csv/example.csv' - | options: - | header: true - | sep: '|' - | - | - |output: - | - name: table1 - | format: parquet - | mode: overwrite - | path: 'data/parquet/example'"""".stripMargin - - - override def run(args: List[String]): IO[ExitCode] = - etl[IO, Unit](data).as(ExitCode.Success) -} -``` - -You can use either the `etl`, `etlIO` or `unsafeETL` methods to run the ETL from the api package. +dependency in your project or using it as a framework in your Apache Spark project. -```scala -def etl[F[_] : Run, O: EvalContext](data: String): F[O] -def etl[F[_] : Run, O: EvalContext](data: ETL): F[O] - -def etlIO[O: EvalContext](data: String): IO[O] -def etlIO[O: EvalContext](data: ETL): IO[O] - -def unsafeETL[O: EvalContext](data: String): O -def unsafeETL[O: EvalContext](data: ETL): O -``` - -### The set of Evaluation Contexts - -The Teckel API offers the `EvalContext[T]`, a versatile construct designed to evaluate ETL contexts and provide results -of type `T`. This enables flexible evaluation strategies for ETL processes, with two primary derivations: - -- `EvalContext[Unit]`: This context executes the ETL process, performing all specified operations, and ultimately - produces the spected output files. It is ideal for scenarios where the primary objective is the completion of data - transformations and load operations. 
-- `EvalContext[Context[DataFrame]]`: This context evaluates the ETL instructions with a focus on debugging and analysis. - Instead of executing transformations outright, it returns a `Context[DataFrame]`, which maps ETL component names to - their corresponding DataFrames. This allows developers to inspect intermediate DataFrames, facilitating a deeper - understanding of the data flow and transformation logic within the ETL process. - -### As Framework - -Teckel can also be used as a framework in your Apache Spark project by including the Teckel ETL Uber JAR in your Apache -Spark ecosystem. - -Build the Teckel ETL CLI into an Uber JAR using the following command: - -```bash -sbt cli/assembly -``` - -#### Local Spark Environment Setup - -Copy the Teckel ETL Uber JAR to the `/opt/spark/jars/` directory in your Apache Spark ecosystem: - -```bash -cp cli/target/scala-2.13/teckel-etl_2.13.jar /opt/spark/jars/ -``` - -#### Docker Usage - -Mount the Teckel ETL Uber JAR in your Docker container: - -```bash -docker run -v ./cli/target/scala-2.13/teckel-etl_2.13.jar:/app/teckel-etl_2.13.jar -it eff3ct/spark:latest /bin/bash - -``` - -## ETL Yaml Example - -Here's an example of a fully defined ETL configuration using a YAML file: - -### SQL ETL - -- Simple Example: [here](./docs/etl/simple.yaml) -- Complex Example: [here](./docs/etl/complex.yaml) -- Other Example: [here](./docs/etl/example.yaml) - -### SQL Transformations - -- `Select` Example: [here](./docs/etl/select.yaml) -- `Where` Example: [here](./docs/etl/where.yaml) -- `Group By` Example: [here](./docs/etl/group-by.yaml) -- `Order By` Example: [here](./docs/etl/order-by.yaml) +Check the [Integration with Apache Spark](./docs/integration-apache-spark.md) documentation for more information. 
## Development and Contribution diff --git a/docs/etl-grammar-ideas.md b/docs/etl-grammar-ideas.md deleted file mode 100644 index 30ad7db..0000000 --- a/docs/etl-grammar-ideas.md +++ /dev/null @@ -1,161 +0,0 @@ -# ETL Grammar Ideas - -version 1 - -```yaml -etl: - name: table1 - join: null - right: - - table2: - - 't2pk1:t1pk1' - - 't2pk2:t1pk2' - - table3: - - 't3pk1:t1pk1' - predicate: - - expr: expr -``` - -version 2 - -```yaml -- name: tableResult - join: - joinType: left - relation: - - table1: - - t1pk1: t2pk1 - - t1pk2: t2pk1 - - table2: - - t3pk1: t1pk1 - - t3pk2: t1pk1 - -- name: table2 - source: - path: 'prefix://bucket1/path1/path2' - -``` - -version 3 - -```txt -Source ::= Unknown | From -From ::= `From` | | -Input := `Input` -Output := `Output` - -// TODO: It need double-check and define correctly -Transformation ::= JoinOperation | GroupOperation | WindowOperation - -// Join -JoinOperation ::= `Join` -JoinType ::= `Inner` | `Left` | `Right` | `Cross` | ... -JoinRelation ::= `JoinRelation` [ ] -RelationField ::= `RelationField` - -// Group -GroupOperation ::= `Group` -By ::= `By` [Column] -Agg ::= `Agg` [Column] - -Select ::= `Select` [Column] -Where ::= `Where` [Column] - -// Type Alias -AssetRef := String -Format := String -SourceRef := String -Options := `Map` String String -Context := `Map` -``` diff --git a/docs/integration-apache-spark.md b/docs/integration-apache-spark.md new file mode 100644 index 0000000..0a6d452 --- /dev/null +++ b/docs/integration-apache-spark.md @@ -0,0 +1,197 @@ +# Integration with Apache Spark + +Teckel can be integrated with Apache Spark easily just adding either the Teckel CLI or Teckel Api as a +dependency in your project or using it as a framework in your Apache Spark project. + +--- + +## As Framework + +Teckel can also be used as a framework in your Apache Spark project by including the Teckel ETL Uber JAR in your Apache +Spark ecosystem. 
+ +Build the Teckel ETL CLI into an Uber JAR using the following command: + +```bash +sbt cli/assembly +``` + +### Local Spark Environment Setup + +Copy the Teckel ETL Uber JAR to the `/path/to/teckel/` directory in your Apache Spark ecosystem: + +```bash +cp cli/target/scala-2.13/teckel-etl_2.13.jar /path/to/teckel/teckel-etl_2.13.jar +``` + +### Docker Usage + +Mount the Teckel ETL Uber JAR in your Docker container: + +```bash +docker run -v ./cli/target/scala-2.13/teckel-etl_2.13.jar:/path/to/teckel/teckel-etl_2.13.jar -it eff3ct/spark:latest /bin/bash + +``` + +### Execution using the CLI + +Once the `teckel-etl_2.13.jar` is ready, use it to execute ETL processes on Apache Spark with the following arguments: + +- `-f` or `--file`: The path to the ETL file. +- `-c` or `--console`: Run the ETL in the console. + +#### Example: Running ETL in Apache Spark using STDIN + +```bash +cat << EOF | /opt/spark/bin/spark-submit --class com.eff3ct.teckel.app.Main /path/to/teckel/teckel-etl_2.13.jar -c +input: + - name: table1 + format: csv + path: '/path/to/data/file.csv' + options: + header: true + sep: '|' + + +output: + - name: table1 + format: parquet + mode: overwrite + path: '/path/to/output/' +EOF +``` + +--- + +## As Dependency + +Teckel can be integrated with Apache Spark easily just adding either the Teckel CLI or Teckel Api as a +dependency in your project. + +### SBT + +In your `build.sbt` file, add the following dependency: + +```scala +libraryDependencies += "com.eff3ct" %% "teckel-cli" % "" +// or +libraryDependencies += "com.eff3ct" %% "teckel-api" % "" +``` + +### Examples + +In the following examples, we will see how to use the Teckel API to run ETL processes in a standalone application. Also, +you can check the [example](../example/src/main/scala/com/eff3ct/teckel/api/example) folder for more examples.
+ +#### Example: Running ETL in a Standalone Application + +```scala +import cats.effect.{ExitCode, IO, IOApp} +import com.eff3ct.teckel.api._ +import com.eff3ct.teckel.semantic.execution._ +import org.apache.spark.sql.SparkSession + +object Example extends IOApp { + + /** + * Spark session used to run the ETL + */ + + implicit val spark: SparkSession = ??? + + val data: String = + """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |output: + | - name: table1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example'""".stripMargin + + + override def run(args: List[String]): IO[ExitCode] = + etl[IO, Unit](data).as(ExitCode.Success) +} +``` + +#### Example: Debugging ETL's DataFrames in a Standalone Application + +```scala +import cats.effect.{ExitCode, IO, IOApp} +import com.eff3ct.teckel.api._ +import com.eff3ct.teckel.model.Context +import com.eff3ct.teckel.semantic.evaluation._ +import org.apache.spark.sql.{DataFrame, SparkSession} + +object Example extends IOApp { + + /** + * Spark session used to run the ETL + */ + + implicit val spark: SparkSession = ??? + + val data: String = + """input: + | - name: table1 + | format: csv + | path: 'data/csv/example.csv' + | options: + | header: true + | sep: '|' + | + | + |output: + | - name: table1 + | format: parquet + | mode: overwrite + | path: 'data/parquet/example'""".stripMargin + + + override def run(args: List[String]): IO[ExitCode] = + etl[IO, Context[DataFrame]](data) + .map { ctx => + ctx.foreach { case (tableName, df) => + println(s"Table: $tableName") + df.show(false) + } + } + .as(ExitCode.Success) +} +``` + +You can use either the `etl`, `etlIO` or `unsafeETL` methods to run the ETL from the api package.
+ +```scala +def etl[F[_] : Run, O: EvalContext](data: String): F[O] +def etl[F[_] : Run, O: EvalContext](data: ETL): F[O] + +def etlIO[O: EvalContext](data: String): IO[O] +def etlIO[O: EvalContext](data: ETL): IO[O] + +def unsafeETL[O: EvalContext](data: String): O +def unsafeETL[O: EvalContext](data: ETL): O +``` + +--- + +## The set of Evaluation Contexts + +The Teckel API offers the `EvalContext[T]`, a versatile construct designed to evaluate ETL contexts and provide results +of type `T`. This enables flexible evaluation strategies for ETL processes, with two primary derivations: + +- `EvalContext[Unit]`: This context executes the ETL process, performing all specified operations, and ultimately + produces the expected output files. It is ideal for scenarios where the primary objective is the completion of data + transformations and load operations. +- `EvalContext[Context[DataFrame]]`: This context evaluates the ETL instructions with a focus on debugging and analysis. + Instead of executing transformations outright, it returns a `Context[DataFrame]`, which maps ETL component names to + their corresponding DataFrames. This allows developers to inspect intermediate DataFrames, facilitating a deeper + understanding of the data flow and transformation logic within the ETL process. + diff --git a/docs/publish.md b/docs/publish.md deleted file mode 100644 index 9aac277..0000000 --- a/docs/publish.md +++ /dev/null @@ -1,64 +0,0 @@ -# Publishing in own local repository - -## Prerequisites - -In order to publish your project, you need to: - -- Add your own credentials to the `~/.sbt/.credentials` file. - -```text -realm=Sonatype Nexus Repository Manager -host= -user= -password= -``` - -- Add the repository configuration for every publish scope in `~/.sbt/.nexus-` file.
- -```text -protocol=[http|https] -host= -port= -scope=[snapshot|releases|other] -repository=[maven-snapshot|maven-releases|other] -``` - -For this case, we have the following configuration: - -* Credentials for publishing to the` repository in `.sbt/.credentials` - -```text -realm=Sonatype Nexus Repository Manager -host=localhost -user=admin -password=admin -``` - -* Configuration for the `publish` scope in `.sbt/.nexus-releases` - -```text -protocol=http -host=localhost -port=9999 -scope=releases -repository=maven-releases -``` - -* Configuration for the `publish` scope in `.sbt/.nexus-snapshots` - -```text -protocol=http -host=localhost -port=9999 -scope=snapshots -repository=maven-snapshots -``` - -### Publish - -```shell -sbt clean -sbt publish -``` - -The first command cleans the build and the second command publishes the project to the Sonatype repository. \ No newline at end of file From 143f0218763248bfddf175dea55457691ec3a39b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Mon, 13 Jan 2025 13:42:16 +0000 Subject: [PATCH 18/20] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index da69730..39ebc24 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Once the `teckel-etl_2.13.jar`is ready, use it to execute ETL processes on Apach
Demo - Teckel and Apache Spark by STDIN -[![Teckel and Apache Spark by Yaml File](https://res.cloudinary.com/marcomontalbano/image/upload/v1735905159/video_to_markdown/images/youtube--eJwJIbNAtto-c05b58ac6eb4c4700831b2b3070cd403.jpg)](https://www.youtube.com/watch?v=oxNjnxIdbig "Teckel and Apache Spark by STDIN") +[![Teckel and Apache Spark by STDIN](https://res.cloudinary.com/marcomontalbano/image/upload/v1735905159/video_to_markdown/images/youtube--eJwJIbNAtto-c05b58ac6eb4c4700831b2b3070cd403.jpg)](https://www.youtube.com/watch?v=V9PzMdZ6u2U "Teckel and Apache Spark by STDIN")
From 9bcf9943fe507f8668e49593c5c8bd900397b5e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Tue, 28 Jan 2025 17:38:45 +0100 Subject: [PATCH 19/20] Join Feature (#33) --- .github/workflows/ci.yml | 18 +++- .github/workflows/release.yml | 18 ++++ README.md | 5 +- api/src/test/resources/data/csv/example-2.csv | 10 +++ api/src/test/resources/data/csv/example.csv | 58 ++++++------- api/src/test/resources/etl/group-by.yaml | 23 +++++ api/src/test/resources/etl/join.yaml | 33 +++++++ api/src/test/resources/etl/order-by.yaml | 22 +++++ api/src/test/resources/etl/select.yaml | 21 +++++ api/src/test/resources/etl/simple.yaml | 2 +- api/src/test/resources/etl/where.yaml | 19 ++++ .../com/eff3ct/teckel/api/ExampleSpec.scala | 60 ++++++++++--- .../eff3ct/teckel/api/SparkTestUtils.scala | 39 +++++++++ .../com/eff3ct/teckel/api/TestResources.scala | 24 +++++ build.sbt | 20 +++-- docs/etl/join.yaml | 56 ++++++++++++ .../com/eff3ct/teckel/model/Source.scala | 4 + .../com/eff3ct/teckel/model/package.scala | 10 ++- project/BuildPlugin.scala | 79 +---------------- project/Dependency.scala | 20 ----- project/Extension.scala | 47 ++++++++++ project/Header.scala | 37 ++++++++ project/plugins.sbt | 2 + .../teckel/semantic/core/EvalAsset.scala | 4 - .../teckel/semantic/core/Semantic.scala | 8 +- .../teckel/semantic/core/SemanticMany.scala | 37 ++++++++ .../eff3ct/teckel/semantic/core/package.scala | 7 ++ .../eff3ct/teckel/semantic/evaluation.scala | 66 ++++++++++++-- .../eff3ct/teckel/semantic/execution.scala | 15 ++-- .../teckel/semantic/sources/Debug.scala | 69 +++++++-------- .../eff3ct/teckel/semantic/sources/Exec.scala | 4 - .../teckel/semantic/sources/package.scala | 8 +- .../src/test/resources/data/csv/example-2.csv | 10 +++ .../src/test/resources/data/csv/example.csv | 58 ++++++------- .../eff3ct/teckel/semantic/DebugSource.scala | 65 +++++--------- .../teckel/semantic/EvalAssetDebugSpec.scala | 70 
+++++++++++++++ .../teckel/semantic/TestResources.scala | 87 +++++++++++++++++++ .../teckel/serializer/model/operations.scala | 28 +++++- .../serializer/model/transformation.scala | 5 +- .../com/eff3ct/teckel/transform/Rewrite.scala | 8 ++ 40 files changed, 887 insertions(+), 289 deletions(-) create mode 100644 api/src/test/resources/data/csv/example-2.csv create mode 100644 api/src/test/resources/etl/group-by.yaml create mode 100644 api/src/test/resources/etl/join.yaml create mode 100644 api/src/test/resources/etl/order-by.yaml create mode 100644 api/src/test/resources/etl/select.yaml create mode 100644 api/src/test/resources/etl/where.yaml create mode 100644 api/src/test/scala/com/eff3ct/teckel/api/SparkTestUtils.scala create mode 100644 api/src/test/scala/com/eff3ct/teckel/api/TestResources.scala create mode 100644 docs/etl/join.yaml create mode 100644 project/Extension.scala create mode 100644 project/Header.scala create mode 100644 semantic/src/main/scala/com/eff3ct/teckel/semantic/core/SemanticMany.scala create mode 100644 semantic/src/main/scala/com/eff3ct/teckel/semantic/core/package.scala create mode 100644 semantic/src/test/resources/data/csv/example-2.csv create mode 100644 semantic/src/test/scala/com/eff3ct/teckel/semantic/EvalAssetDebugSpec.scala create mode 100644 semantic/src/test/scala/com/eff3ct/teckel/semantic/TestResources.scala diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b26b9fc..9e31057 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,8 +31,22 @@ jobs: - name: Create header and run scalafmt run: sbt headerCreateAll scalafmtAll - - name: Build and Test - run: sbt -v +test + - name: Build and Test with Coverage + run: sbt clean coverage test coverageReport coverageAggregate + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: eff3ct0/teckel + files: target/scala-*/scoverage-report/scoverage.xml +
fail_ci_if_error: true + + - name: Upload test results + uses: actions/upload-artifact@v3 + with: + name: test-results + path: target/test-reports # Optional: This step uploads information to the GitHub dependency graph and unblocking Dependabot alerts for the repository #- name: Upload dependency graph diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6dc9eaa..f9d3ec1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,6 +16,24 @@ jobs: java-version: 11 cache: sbt - uses: sbt/setup-sbt@v1 + + - name: Build and Test with Coverage + run: sbt clean coverage test coverageReport coverageAggregate + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: eff3ct0/teckel + files: target/scala-*/scoverage-report/scoverage.xml + fail_ci_if_error: true + + - name: Upload test results + uses: actions/upload-artifact@v3 + with: + name: test-results + path: target/test-reports + - run: sbt ci-release env: PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} diff --git a/README.md b/README.md index 39ebc24..a516b50 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # Teckel -[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?branch=master)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +[![Release](https://github.com/rafafrdz/teckel/actions/workflows/release.yml/badge.svg?branch=master)](https://github.com/rafafrdz/teckel/actions/workflows/release.yml) +[![codecov](https://codecov.io/gh/eff3ct0/teckel/graph/badge.svg?token=24E1IZ0K2H)](https://codecov.io/gh/eff3ct0/teckel) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) Teckel is a framework designed to simplify the creation of Apache Spark ETL (Extract,
Transform, Load) processes using YAML configuration files. This tool aims to standardize and streamline ETL workflow creation by @@ -34,6 +36,7 @@ Here's an example of a fully defined ETL configuration using a YAML file: - `Where` Example: [here](./docs/etl/where.yaml) - `Group By` Example: [here](./docs/etl/group-by.yaml) - `Order By` Example: [here](./docs/etl/order-by.yaml) +- `Join` Example: [here](./docs/etl/join.yaml) ## Getting Started diff --git a/api/src/test/resources/data/csv/example-2.csv b/api/src/test/resources/data/csv/example-2.csv new file mode 100644 index 0000000..1087e6a --- /dev/null +++ b/api/src/test/resources/data/csv/example-2.csv @@ -0,0 +1,10 @@ +ID|Date|Symbol|Adj Close|Close|High|Low|Open|Volume +1|2024-11-05|ZTS|175.27|175.27|176.80|172.25|174.55|2453800.0 +2|2024-11-06|ZTS|170.3699951171875|170.3699951171875|178.9199981689453|169.30999755859375|178.52000427246094|5362100.0 +3|2024-11-07|ZTS|174.25|174.25|174.7899932861328|169.63999938964844|172.58999633789062|3781300.0 +4|2024-11-08|ZTS|176.82000732421875|176.82000732421875|177.10000610351562|173.22000122070312|174.25|3243400.0 +5|2024-11-11|ZTS|176.14999389648438|176.14999389648438|178.3800048828125|175.0|176.92999267578125|3399500.0 +6|2024-11-12|ZTS|173.9600067138672|173.9600067138672|176.50999450683594|173.75|175.38999938964844|2704100.0 +7|2024-11-13|ZTS|177.0399932861328|177.0399932861328|177.5|174.91000366210938|175.32000732421875|2375300.0 +8|2024-11-14|ZTS|174.6300048828125|174.6300048828125|178.97000122070312|173.80999755859375|177.47999572753906|3009800.0 +9|2024-11-15|ZTS|175.13999938964844|175.13999938964844|177.07000732421875|170.75|173.0|3426500.0 \ No newline at end of file diff --git a/api/src/test/resources/data/csv/example.csv b/api/src/test/resources/data/csv/example.csv index 4dd58b4..865fa02 100644 --- a/api/src/test/resources/data/csv/example.csv +++ b/api/src/test/resources/data/csv/example.csv @@ -1,29 +1,29 @@ -Date|Symbol|Adj Close|Close|High|Low|Open|Volume 
-2024-11-05|ZTS|175.27|175.27|176.80|172.25|174.55|2453800.0 -2024-11-06|ZTS|170.3699951171875|170.3699951171875|178.9199981689453|169.30999755859375|178.52000427246094|5362100.0 -2024-11-07|ZTS|174.25|174.25|174.7899932861328|169.63999938964844|172.58999633789062|3781300.0 -2024-11-08|ZTS|176.82000732421875|176.82000732421875|177.10000610351562|173.22000122070312|174.25|3243400.0 -2024-11-11|ZTS|176.14999389648438|176.14999389648438|178.3800048828125|175.0|176.92999267578125|3399500.0 -2024-11-12|ZTS|173.9600067138672|173.9600067138672|176.50999450683594|173.75|175.38999938964844|2704100.0 -2024-11-13|ZTS|177.0399932861328|177.0399932861328|177.5|174.91000366210938|175.32000732421875|2375300.0 -2024-11-14|ZTS|174.6300048828125|174.6300048828125|178.97000122070312|173.80999755859375|177.47999572753906|3009800.0 -2024-11-15|ZTS|175.13999938964844|175.13999938964844|177.07000732421875|170.75|173.0|3426500.0 -2024-11-18|ZTS|176.4199981689453|176.4199981689453|177.1999969482422|173.66000366210938|174.1300048828125|3172900.0 -2024-11-19|ZTS|175.55999755859375|175.55999755859375|176.5|173.24000549316406|174.8800048828125|2208300.0 -2024-11-20|ZTS|175.6699981689453|175.6699981689453|177.41000366210938|173.8300018310547|176.4199981689453|2187300.0 -2024-11-21|ZTS|176.7100067138672|176.7100067138672|177.66000366210938|174.5500030517578|175.6999969482422|2019500.0 -2024-11-22|ZTS|176.9600067138672|176.9600067138672|178.07000732421875|176.27999877929688|176.35000610351562|1854600.0 -2024-11-25|ZTS|178.7100067138672|178.7100067138672|178.8000030517578|176.14999389648438|177.0|4558300.0 -2024-11-26|ZTS|175.6999969482422|175.6999969482422|178.64999389648438|174.83999633789062|178.4499969482422|2539600.0 -2024-11-27|ZTS|176.74000549316406|176.74000549316406|179.27000427246094|175.0|175.27000427246094|2315800.0 -2024-11-29|ZTS|175.25|175.25|177.80999755859375|175.24000549316406|176.92999267578125|1543400.0 
-2024-12-02|ZTS|176.80999755859375|176.80999755859375|176.91000366210938|173.72999572753906|175.77999877929688|2391500.0 -2024-12-03|ZTS|176.94000244140625|176.94000244140625|181.39999389648438|176.55999755859375|176.7100067138672|2679000.0 -2024-12-04|ZTS|175.32000732421875|175.32000732421875|178.5|174.5399932861328|174.60000610351562|2687000.0 -2024-12-05|ZTS|174.77000427246094|174.77000427246094|176.52999877929688|173.72000122070312|175.27000427246094|2442000.0 -2024-12-06|ZTS|176.4600067138672|176.4600067138672|177.5500030517578|174.41000366210938|174.77000427246094|2551200.0 -2024-12-09|ZTS|178.14999389648438|178.14999389648438|179.77999877929688|175.0800018310547|175.8800048828125|2387300.0 -2024-12-10|ZTS|176.7100067138672|176.7100067138672|178.50999450683594|176.1999969482422|177.8800048828125|1678200.0 -2024-12-11|ZTS|177.1699981689453|177.1699981689453|178.3800048828125|175.80999755859375|176.25999450683594|1782400.0 -2024-12-12|ZTS|178.83999633789062|178.83999633789062|179.6999969482422|176.55999755859375|176.9499969482422|1936000.0 -2024-12-13|ZTS|178.17999267578125|178.17999267578125|181.85000610351562|176.6300048828125|178.97999572753906|1650300.0 +ID|Date|Symbol|Adj Close|Close|High|Low|Open|Volume +1|2024-11-05|ZTS|175.27|175.27|176.80|172.25|174.55|2453800.0 +2|2024-11-06|ZTS|170.3699951171875|170.3699951171875|178.9199981689453|169.30999755859375|178.52000427246094|5362100.0 +3|2024-11-07|ZTS|174.25|174.25|174.7899932861328|169.63999938964844|172.58999633789062|3781300.0 +4|2024-11-08|ZTS|176.82000732421875|176.82000732421875|177.10000610351562|173.22000122070312|174.25|3243400.0 +5|2024-11-11|ZTS|176.14999389648438|176.14999389648438|178.3800048828125|175.0|176.92999267578125|3399500.0 +6|2024-11-12|ZTS|173.9600067138672|173.9600067138672|176.50999450683594|173.75|175.38999938964844|2704100.0 +7|2024-11-13|ZTS|177.0399932861328|177.0399932861328|177.5|174.91000366210938|175.32000732421875|2375300.0 
+8|2024-11-14|ZTS|174.6300048828125|174.6300048828125|178.97000122070312|173.80999755859375|177.47999572753906|3009800.0 +9|2024-11-15|ZTS|175.13999938964844|175.13999938964844|177.07000732421875|170.75|173.0|3426500.0 +10|2024-11-18|ZTS|176.4199981689453|176.4199981689453|177.1999969482422|173.66000366210938|174.1300048828125|3172900.0 +11|2024-11-19|ZTS|175.55999755859375|175.55999755859375|176.5|173.24000549316406|174.8800048828125|2208300.0 +12|2024-11-20|ZTS|175.6699981689453|175.6699981689453|177.41000366210938|173.8300018310547|176.4199981689453|2187300.0 +13|2024-11-21|ZTS|176.7100067138672|176.7100067138672|177.66000366210938|174.5500030517578|175.6999969482422|2019500.0 +14|2024-11-22|ZTS|176.9600067138672|176.9600067138672|178.07000732421875|176.27999877929688|176.35000610351562|1854600.0 +15|2024-11-25|ZTS|178.7100067138672|178.7100067138672|178.8000030517578|176.14999389648438|177.0|4558300.0 +16|2024-11-26|ZTS|175.6999969482422|175.6999969482422|178.64999389648438|174.83999633789062|178.4499969482422|2539600.0 +17|2024-11-27|ZTS|176.74000549316406|176.74000549316406|179.27000427246094|175.0|175.27000427246094|2315800.0 +18|2024-11-29|ZTS|175.25|175.25|177.80999755859375|175.24000549316406|176.92999267578125|1543400.0 +19|2024-12-02|ZTS|176.80999755859375|176.80999755859375|176.91000366210938|173.72999572753906|175.77999877929688|2391500.0 +20|2024-12-03|ZTS|176.94000244140625|176.94000244140625|181.39999389648438|176.55999755859375|176.7100067138672|2679000.0 +21|2024-12-04|ZTS|175.32000732421875|175.32000732421875|178.5|174.5399932861328|174.60000610351562|2687000.0 +22|2024-12-05|ZTS|174.77000427246094|174.77000427246094|176.52999877929688|173.72000122070312|175.27000427246094|2442000.0 +23|2024-12-06|ZTS|176.4600067138672|176.4600067138672|177.5500030517578|174.41000366210938|174.77000427246094|2551200.0 +24|2024-12-09|ZTS|178.14999389648438|178.14999389648438|179.77999877929688|175.0800018310547|175.8800048828125|2387300.0 
+25|2024-12-10|ZTS|176.7100067138672|176.7100067138672|178.50999450683594|176.1999969482422|177.8800048828125|1678200.0 +26|2024-12-11|ZTS|177.1699981689453|177.1699981689453|178.3800048828125|175.80999755859375|176.25999450683594|1782400.0 +27|2024-12-12|ZTS|178.83999633789062|178.83999633789062|179.6999969482422|176.55999755859375|176.9499969482422|1936000.0 +28|2024-12-13|ZTS|178.17999267578125|178.17999267578125|181.85000610351562|176.6300048828125|178.97999572753906|1650300.0 \ No newline at end of file diff --git a/api/src/test/resources/etl/group-by.yaml b/api/src/test/resources/etl/group-by.yaml new file mode 100644 index 0000000..88c29a3 --- /dev/null +++ b/api/src/test/resources/etl/group-by.yaml @@ -0,0 +1,23 @@ +input: + - name: table1 + format: csv + path: 'src/test/resources/data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: groupByTable1 + group: + from: table1 + by: + - Symbol + agg: + - sum(`adj close`) as TotalClose + - max(high) as Highest + - min(low) as Lowest +output: + - name: groupByTable1 + format: parquet + mode: overwrite + path: 'src/test/resources/data/parquet/example/group-by' \ No newline at end of file diff --git a/api/src/test/resources/etl/join.yaml b/api/src/test/resources/etl/join.yaml new file mode 100644 index 0000000..3765ee0 --- /dev/null +++ b/api/src/test/resources/etl/join.yaml @@ -0,0 +1,33 @@ +input: + - name: table1 + format: csv + path: 'src/test/resources/data/csv/example.csv' + options: + header: true + sep: '|' + + - name: table2 + format: csv + path: 'src/test/resources/data/csv/example-2.csv' + options: + header: true + sep: '|' + + +transformation: + + - name: joinTable1 + join: + left: table1 + right: + - name: table2 + type: inner + on: + - table1.id==table2.id + - table1.id>175 + +output: + - name: joinTable2 + format: parquet + mode: overwrite + path: 'src/test/resources/data/parquet/example/join' \ No newline at end of file diff --git 
a/api/src/test/resources/etl/order-by.yaml b/api/src/test/resources/etl/order-by.yaml new file mode 100644 index 0000000..aa3adab --- /dev/null +++ b/api/src/test/resources/etl/order-by.yaml @@ -0,0 +1,22 @@ +input: + - name: table1 + format: csv + path: 'src/test/resources/data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: orderByTable1 + order: + from: table1 + by: + - id + - date + order: Desc + +output: + - name: orderByTable1 + format: parquet + mode: overwrite + path: 'src/test/resources/data/parquet/example/order-by' \ No newline at end of file diff --git a/api/src/test/resources/etl/select.yaml b/api/src/test/resources/etl/select.yaml new file mode 100644 index 0000000..652b193 --- /dev/null +++ b/api/src/test/resources/etl/select.yaml @@ -0,0 +1,21 @@ +input: + - name: table1 + format: csv + path: 'src/test/resources/data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: selectTable1 + select: + from: table1 + columns: + - id + - date + +output: + - name: selectTable1 + format: parquet + mode: overwrite + path: 'src/test/resources/data/parquet/example/select' \ No newline at end of file diff --git a/api/src/test/resources/etl/simple.yaml b/api/src/test/resources/etl/simple.yaml index f0a4400..f329129 100644 --- a/api/src/test/resources/etl/simple.yaml +++ b/api/src/test/resources/etl/simple.yaml @@ -11,4 +11,4 @@ output: - name: table1 format: parquet mode: overwrite - path: 'src/test/resources/data/parquet/example' \ No newline at end of file + path: 'src/test/resources/data/parquet/example/simple' \ No newline at end of file diff --git a/api/src/test/resources/etl/where.yaml b/api/src/test/resources/etl/where.yaml new file mode 100644 index 0000000..5f86ff8 --- /dev/null +++ b/api/src/test/resources/etl/where.yaml @@ -0,0 +1,19 @@ +input: + - name: table1 + format: csv + path: 'src/test/resources/data/csv/example.csv' + options: + header: true + sep: '|' + +transformation: + - name: 
whereTable1 + where: + from: table1 + filter: Date > '2024-12-12' + +output: + - name: whereTable1 + format: parquet + mode: overwrite + path: 'src/test/resources/data/parquet/example/where' \ No newline at end of file diff --git a/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala b/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala index 18a0833..2c6e73b 100644 --- a/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala +++ b/api/src/test/scala/com/eff3ct/teckel/api/ExampleSpec.scala @@ -28,33 +28,69 @@ import cats.effect.IO import cats.effect.unsafe.implicits.global import com.eff3ct.teckel.api.file._ import com.eff3ct.teckel.semantic.execution._ -import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.sql.functions._ import org.scalatest.flatspec.AnyFlatSpecLike import org.scalatest.matchers.should.Matchers -class ExampleSpec extends AnyFlatSpecLike with Matchers { - - def sparkBuilder(): SparkSession = { - val sparkConf: SparkConf = new SparkConf() - val master: String = sparkConf.get("spark.master", "local[*]") - val appName: String = sparkConf.get("spark.app.name", "spark-etl") - SparkSession.builder().config(sparkConf).master(master).appName(appName).getOrCreate() - } - - implicit val spark: SparkSession = sparkBuilder() +class ExampleSpec + extends AnyFlatSpecLike + with Matchers + with DataFrameSuiteBase + with SparkTestUtils + with TestResources { "ExampleSpec" should "work correctly in a ETL F using IO" in { noException should be thrownBy etl[IO, Unit]("src/test/resources/etl/simple.yaml") .unsafeRunSync() + spark.read.parquet("src/test/resources/data/parquet/example/simple") :===: Resources.input } it should "work correctly in a ETL IO" in { noException should be thrownBy etlIO[Unit]("src/test/resources/etl/simple.yaml").unsafeRunSync() + spark.read.parquet("src/test/resources/data/parquet/example/simple") :===: Resources.input } 
it should "work correctly in an unsafe ETL" in { noException should be thrownBy unsafeETL[Unit]("src/test/resources/etl/simple.yaml") + spark.read.parquet("src/test/resources/data/parquet/example/simple") :===: Resources.input } + it should "work correctly a select pipeline" in { + noException should be thrownBy unsafeETL[Unit]("src/test/resources/etl/select.yaml") + spark.read + .parquet("src/test/resources/data/parquet/example/select") :===: + Resources.input.select("id", "date") + } + + it should "work correctly a where pipeline" in { + noException should be thrownBy unsafeETL[Unit]("src/test/resources/etl/where.yaml") + spark.read + .parquet("src/test/resources/data/parquet/example/where") :===: + Resources.input.where("Date > '2024-12-12'") + } + + it should "work correctly a groupBy pipeline" in { + noException should be thrownBy unsafeETL[Unit]("src/test/resources/etl/group-by.yaml") + spark.read + .parquet("src/test/resources/data/parquet/example/group-by") :===: + Resources.input + .groupBy("Symbol") + .agg( + sum("Adj Close") as "TotalClose", + max("High") as "Highest", + min("Low") as "Lowest" + ) + } + + it should "work correctly a orderBy pipeline" in { + noException should be thrownBy unsafeETL[Unit]("src/test/resources/etl/order-by.yaml") + spark.read + .parquet("src/test/resources/data/parquet/example/order-by") :===: + Resources.input.orderBy("Id", "Date") + + } +// it should "work correctly a join pipeline" in { + // noException should be thrownBy unsafeETL[Unit]("src/test/resources/etl/join.yaml") + // } } diff --git a/api/src/test/scala/com/eff3ct/teckel/api/SparkTestUtils.scala b/api/src/test/scala/com/eff3ct/teckel/api/SparkTestUtils.scala new file mode 100644 index 0000000..05bc004 --- /dev/null +++ b/api/src/test/scala/com/eff3ct/teckel/api/SparkTestUtils.scala @@ -0,0 +1,39 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this 
software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package com.eff3ct.teckel.api + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.sql.{DataFrame, SparkSession} + +trait SparkTestUtils { + self: DataFrameSuiteBase => + + implicit lazy val sp: SparkSession = self.spark + + implicit class DataFrameAssert(df: DataFrame) { + def :===:(expected: DataFrame): Unit = + assertDataFrameEquals(df, expected) + } +} diff --git a/api/src/test/scala/com/eff3ct/teckel/api/TestResources.scala b/api/src/test/scala/com/eff3ct/teckel/api/TestResources.scala new file mode 100644 index 0000000..2d51a09 --- /dev/null +++ b/api/src/test/scala/com/eff3ct/teckel/api/TestResources.scala @@ -0,0 +1,24 @@ +package com.eff3ct.teckel.api + +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.sql.DataFrame + +trait TestResources { + self: DataFrameSuiteBase with SparkTestUtils => + + object Resources { + val input: DataFrame = spark.read + .format("csv") + .option("header", "true") + .option("sep", 
"|") + .load("src/test/resources/data/csv/example.csv") + .as("table1") + + val input2: DataFrame = spark.read + .format("csv") + .option("header", "true") + .option("sep", "|") + .load("src/test/resources/data/csv/example-2.csv") + .as("table2") + } +} diff --git a/build.sbt b/build.sbt index d5a4572..881144d 100644 --- a/build.sbt +++ b/build.sbt @@ -1,4 +1,4 @@ -import Dependency.ProjectOps +import Extension.ProjectOps lazy val root = (project in file(".")) @@ -26,7 +26,8 @@ lazy val model = name := "teckel-model", libraryDependencies ++= Dependency.model, publish / skip := false - ).withNoAssembly + ) + .withNoAssembly lazy val semantic = (project in file("./semantic")) @@ -35,8 +36,10 @@ lazy val semantic = name := "teckel-semantic", libraryDependencies ++= Dependency.semantic, publish / skip := false - ).withNoAssembly + ) + .withNoAssembly .withKindProjector + .withCoverage /** Serializer */ lazy val serializer = @@ -46,7 +49,9 @@ lazy val serializer = name := "teckel-serializer", publish / skip := false, libraryDependencies ++= Dependency.serializer - ).withNoAssembly + ) + .withNoAssembly + .withCoverage lazy val api = (project in file("./api")) @@ -55,7 +60,9 @@ lazy val api = name := "teckel-api", publish / skip := false, libraryDependencies ++= Dependency.api - ).withNoAssembly + ) + .withNoAssembly + .withCoverage lazy val cli = (project in file("./cli")) @@ -64,7 +71,8 @@ lazy val cli = name := "teckel-cli", publish / skip := false, libraryDependencies ++= Dependency.sparkD - ).withAssembly("teckel-etl") + ) + .withAssembly("teckel-etl") lazy val example = (project in file("./example")) diff --git a/docs/etl/join.yaml b/docs/etl/join.yaml new file mode 100644 index 0000000..10ac790 --- /dev/null +++ b/docs/etl/join.yaml @@ -0,0 +1,56 @@ +input: + - name: table1 + format: csv + path: 'data/csv/example.csv' + options: + header: true + sep: '|' + + - name: table2 + format: parquet + path: 'data/parquet/example2' + + - name: table3 + format: csv + path: 
'data/csv/example3.csv' + options: + header: true + sep: '|' + + - name: table4 + format: parquet + path: 'data/parquet/example4' + +transformation: + + - name: joinTable1 + join: + left: table1 + right: + - name: table2 + type: inner + on: + - table1.col11==table2.col21 + - table1.col12>=table2.col22 + + - name: table3 + type: left + on: + - table1.col11==table3.col31 + - table1.col13==table3.col32 + + - name: joinTable2 + join: + left: table4 + right: + - name: joinTable1 + type: left + on: + - table4.col41==table1.col11 + - table4.col42!=table1.col14 + +output: + - name: joinTable2 + format: parquet + mode: overwrite + path: 'data/parquet/example' \ No newline at end of file diff --git a/model/src/main/scala/com/eff3ct/teckel/model/Source.scala b/model/src/main/scala/com/eff3ct/teckel/model/Source.scala index 09d432d..2a5339e 100644 --- a/model/src/main/scala/com/eff3ct/teckel/model/Source.scala +++ b/model/src/main/scala/com/eff3ct/teckel/model/Source.scala @@ -56,4 +56,8 @@ object Source { case class OrderBy(assetRef: AssetRef, by: NonEmptyList[Column], order: Option[Order]) extends Transformation + + case class Join(assetRef: AssetRef, others: NonEmptyList[Relation]) extends Transformation + + case class Relation(name: AssetRef, joinType: RelationType, on: List[Condition]) } diff --git a/model/src/main/scala/com/eff3ct/teckel/model/package.scala b/model/src/main/scala/com/eff3ct/teckel/model/package.scala index 4aa8cde..1939379 100644 --- a/model/src/main/scala/com/eff3ct/teckel/model/package.scala +++ b/model/src/main/scala/com/eff3ct/teckel/model/package.scala @@ -23,6 +23,7 @@ */ package com.eff3ct.teckel +import scala.collection.mutable.{Map => MMap} package object model { @@ -32,8 +33,11 @@ package object model { type Mode = String type Options = Map[String, String] type Context[T] = Map[AssetRef, T] + // TODO. 
Use a Effect Mutable State to keep track of the already evaluated assets + type Mutex[T] = MMap[AssetRef, T] - type Column = String - type Condition = String - type Order = String + type Column = String + type Condition = String + type Order = String + type RelationType = String } diff --git a/project/BuildPlugin.scala b/project/BuildPlugin.scala index fb9cf07..e880c2b 100644 --- a/project/BuildPlugin.scala +++ b/project/BuildPlugin.scala @@ -1,5 +1,5 @@ -import de.heikoseeberger.sbtheader.HeaderPlugin.autoImport.{HeaderLicense, headerLicense} -import de.heikoseeberger.sbtheader.{HeaderPlugin, License} +import de.heikoseeberger.sbtheader.HeaderPlugin +import de.heikoseeberger.sbtheader.HeaderPlugin.autoImport.headerLicense import sbt.Keys._ import sbt._ import sbt.plugins.JvmPlugin @@ -34,50 +34,7 @@ object BuildPlugin extends AutoPlugin { run / fork := true, Test / fork := true, Test / parallelExecution := false, - headerLicense := Some(headerIOLicense), - scalacOptions ++= Vector( -// "-release:11", - "-Ymacro-annotations", - "-deprecation", // Emit warnings for deprecated APIs. -// "-Ypartial-unification", // Just for Scala 2.12.x - Enable partial unification in type constructor inference -// "-Wnonunit-statement", // Just for Scala 2.13.x - Warn when a block that doesn't contain a statement (e.g. an if with an else clause without the else clause) evaluates to Unit. -// "-encoding:utf-8", // Specify character encoding used by source files. - "-explaintypes", // Explain type errors in more detail. - "-feature", // Emit warnings for features that should be imported explicitly. - "-language:existentials", // Allow existential types (besides wildcard types). - "-language:experimental.macros", // Allow macro definition (besides implementation and application). - "-language:higherKinds", // Allow higher-kinded types. - "-language:implicitConversions", // Allow definition of implicit functions called views. 
- "-unchecked", // Enable additional warnings where generated code depends on assumptions. - "-Xcheckinit", // Wrap field accessors to throw an exception on uninitialized access. - "-Xfatal-warnings", // Fail compilation if there are any warnings. - "-Xlint:adapted-args", // Warn if an argument list is modified to match the receiver. - "-Xlint:constant", // Warn if constant expressions evaluate to an error. - "-Xlint:delayedinit-select", // Warn about selecting members of DelayedInit. - "-Xlint:doc-detached", // Warn if Scaladoc comments appear detached from their element. - "-Xlint:inaccessible", // Warn about inaccessible types in method signatures. - "-Xlint:infer-any", // Warn when a type argument is inferred to be Any. - "-Xlint:missing-interpolator", // Warn if a string literal is missing an interpolator id. - "-Xlint:nullary-unit", // Warn if nullary methods return Unit. - "-Xlint:option-implicit", // Warn about implicit views in Option.apply. - "-Xlint:package-object-classes", // Warn if classes or objects are defined in package objects. - "-Xlint:poly-implicit-overload", // Warn if parameterized overloaded implicit methods are not visible as view bounds. - "-Xlint:private-shadow", // Warn if a private field shadows a superclass field. - "-Xlint:stars-align", // Warn if wildcard patterns do not align with sequence components. - "-Xlint:type-parameter-shadow", // Warn if a local type parameter shadows a type already in scope. -// "-Xlint:unsound-match", // Warn if a pattern match may not be typesafe. - "-Ywarn-dead-code", // Warn when dead code is identified. - "-Ywarn-extra-implicit", // Warn when more than one implicit parameter section is defined. - "-Ywarn-unused:implicits", // Warn if an implicit parameter is unused. -// "-Ywarn-unused:imports", // Warn if an import selector is not referenced. - "-Ywarn-unused:locals", // Warn if a local definition is unused. - "-Ywarn-unused:explicits", // Warn if a value parameter is unused. 
- "-Ywarn-unused:params", // Warn if a parameter is unused. - "-Ywarn-unused:patvars", // Warn if a variable bound in a pattern is unused. - "-Ywarn-unused:privates" // Warn if a private member is unused. -// "-Ywarn-macros:after" // Warn about macro annotations after expansion. -// "-Ymacro-annotations" // Scala 2.13.x - Allow the use of macro annotations. - ), + headerLicense := Some(Header.headerIOLicense), Compile / console / scalacOptions ~= (_.filterNot( Set("-Xfatal-warnings", "-Ywarn-unused:imports") )), @@ -90,34 +47,4 @@ object BuildPlugin extends AutoPlugin { Test / testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a") ) ++ SonatypePublish.projectSettings - /** - * SBT Header Plugin - */ - - lazy val headerText: String = - """|MIT License - | - |Copyright (c) 2024 Rafael Fernandez - | - |Permission is hereby granted, free of charge, to any person obtaining a copy - |of this software and associated documentation files (the "Software"), to deal - |in the Software without restriction, including without limitation the rights - |to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - |copies of the Software, and to permit persons to whom the Software is - |furnished to do so, subject to the following conditions: - | - |The above copyright notice and this permission notice shall be included in all - |copies or substantial portions of the Software. - | - |THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - |IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - |FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - |AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - |LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - |OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - |SOFTWARE. 
- |""".stripMargin - - lazy val headerIOLicense: License.Custom = - HeaderLicense.Custom(headerText) } diff --git a/project/Dependency.scala b/project/Dependency.scala index 02db466..69e2b5e 100644 --- a/project/Dependency.scala +++ b/project/Dependency.scala @@ -1,6 +1,5 @@ import Library._ import sbt._ -import sbtassembly.AssemblyPlugin object Dependency { @@ -44,23 +43,4 @@ object Dependency { spark.sql ) - implicit class ProjectOps(val prj: Project) extends AnyVal { - def withKindProjector: Project = prj.settings( - addCompilerPlugin("org.typelevel" % "kind-projector" % "0.13.2" cross CrossVersion.full) - ) - - def withNoAssembly: Project = prj.disablePlugins(AssemblyPlugin) - - def withAssembly: Project = - prj - .enablePlugins(AssemblyPlugin) - .settings(Assembly.projectSettings(None)) - - def withAssembly(name: String): Project = - prj - .enablePlugins(AssemblyPlugin) - .settings(Assembly.projectSettings(Some(name))) - - } - } diff --git a/project/Extension.scala b/project/Extension.scala new file mode 100644 index 0000000..4e0628d --- /dev/null +++ b/project/Extension.scala @@ -0,0 +1,47 @@ +import sbt._ +import sbtassembly.AssemblyPlugin +import scoverage.ScoverageKeys._ +import scoverage._ + +object Extension { + + implicit class ProjectOps(project: Project) { + + def withNoAssembly: Project = project.disablePlugins(AssemblyPlugin) + + def withAssembly: Project = + project + .enablePlugins(AssemblyPlugin) + .settings(Assembly.projectSettings(None)) + + def withAssembly(name: String): Project = + project + .enablePlugins(AssemblyPlugin) + .settings(Assembly.projectSettings(Some(name))) + + def withKindProjector: Project = + project.settings( + Seq( + addCompilerPlugin("org.typelevel" %% "kind-projector" % "0.13.3" cross CrossVersion.full) + ) + ) + + def withBetterMonadicFor: Project = + project.settings( + Seq( + addCompilerPlugin("com.olegpy" %% "better-monadic-for" % "0.3.1") + ) + ) + + def withCoverage: Project = + project + 
.enablePlugins(ScoverageSbtPlugin) + .settings( + coverageEnabled := true, + coverageFailOnMinimum := false, + coverageMinimumStmtTotal := 30 // TODO. provisional + ) + + } + +} diff --git a/project/Header.scala b/project/Header.scala new file mode 100644 index 0000000..f14e6a2 --- /dev/null +++ b/project/Header.scala @@ -0,0 +1,37 @@ +import de.heikoseeberger.sbtheader.HeaderPlugin.autoImport.HeaderLicense +import de.heikoseeberger.sbtheader.License + +object Header { + + /** + * SBT Header Plugin + */ + + lazy val headerText: String = + """MIT License + | + |Copyright (c) 2024 Rafael Fernandez + | + |Permission is hereby granted, free of charge, to any person obtaining a copy + |of this software and associated documentation files (the "Software"), to deal + |in the Software without restriction, including without limitation the rights + |to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + |copies of the Software, and to permit persons to whom the Software is + |furnished to do so, subject to the following conditions: + | + |The above copyright notice and this permission notice shall be included in all + |copies or substantial portions of the Software. + | + |THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + |IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + |FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + |AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + |LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + |OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + |SOFTWARE. 
+ |""".stripMargin + + lazy val headerIOLicense: License.Custom = + HeaderLicense.Custom(headerText) + +} diff --git a/project/plugins.sbt b/project/plugins.sbt index 0dabbf1..d6f359d 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,4 +1,6 @@ /** Compiler */ +//addSbtPlugin("org.typelevel" % "sbt-tpolecat" % "0.5.2") +addSbtPlugin("org.scoverage" % "sbt-scoverage" % "2.3.0") addCompilerPlugin("com.olegpy" %% "better-monadic-for" % "0.3.1") addCompilerPlugin("org.typelevel" % "kind-projector" % "0.13.2" cross CrossVersion.full) diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala index b7aa885..491218d 100644 --- a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/EvalAsset.scala @@ -24,10 +24,6 @@ package com.eff3ct.teckel.semantic.core -import com.eff3ct.teckel.model._ - -trait EvalAsset[+T] extends Semantic[Asset, Context[Asset], T] - object EvalAsset { def apply[T: EvalAsset]: EvalAsset[T] = implicitly[EvalAsset[T]] diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala index 364cf8d..b12d6d6 100644 --- a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/Semantic.scala @@ -24,7 +24,7 @@ package com.eff3ct.teckel.semantic.core -trait Semantic[-S, -I, +O] { +trait Semantic[S, -I, +O] { def eval(input: I, source: S): O } @@ -32,7 +32,11 @@ object Semantic { def apply[S, I, O](implicit S: Semantic[S, I, O]): Semantic[S, I, O] = S - def pure[S, I, O](f: S => O): Semantic[S, I, O] = (_: I, source: S) => f(source) + def zero[S, I, O](f: S => O): Semantic[S, I, O] = (_: I, source: S) => f(source) + + def map[S, I, O](f: I => O): Semantic[S, I, O] = Semantic.pure((input: I, _) => 
f(input)) + + def pure[S, I, O](f: (I, S) => O): Semantic[S, I, O] = (input: I, source: S) => f(input, source) def any[S, O](f: S)(implicit S: Semantic[S, Any, O]): O = S.eval((), f) } diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/SemanticMany.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/SemanticMany.scala new file mode 100644 index 0000000..187f334 --- /dev/null +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/SemanticMany.scala @@ -0,0 +1,37 @@ +/* + * MIT License + * + * Copyright (c) 2024 Rafael Fernandez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.eff3ct.teckel.semantic.core + +import com.eff3ct.teckel.model.Context + +trait SemanticMany[S, I, +O] { + def eval(source: S, input: I, others: Context[I]): O +} + +object SemanticMany { + + def apply[S, I, O](implicit S: SemanticMany[S, I, O]): SemanticMany[S, I, O] = S + +} diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/package.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/package.scala new file mode 100644 index 0000000..e7d5166 --- /dev/null +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/core/package.scala @@ -0,0 +1,7 @@ +package com.eff3ct.teckel.semantic + +import com.eff3ct.teckel.model.{Asset, Context} + +package object core { + type EvalAsset[+T] = Semantic[Asset, Context[Asset], T] +} diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/evaluation.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/evaluation.scala index 1d09568..134cdf4 100644 --- a/semantic/src/main/scala/com/eff3ct/teckel/semantic/evaluation.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/evaluation.scala @@ -27,24 +27,74 @@ package com.eff3ct.teckel.semantic import com.eff3ct.teckel.model.Source._ import com.eff3ct.teckel.model._ import com.eff3ct.teckel.semantic.core._ -import com.eff3ct.teckel.semantic.sources.Debug -import com.eff3ct.teckel.semantic.sources.Debug._ +import com.eff3ct.teckel.semantic.sources._ import org.apache.spark.sql._ +import scala.collection.mutable.{Map => MMap} + object evaluation { + private def register( + context: Mutex[DataFrame], + a: Asset, + df: DataFrame + ): DataFrame = { + context.put(a.assetRef, df) + df + } + implicit def debug(implicit S: SparkSession): EvalAsset[DataFrame] = new EvalAsset[DataFrame] { + val global: Mutex[DataFrame] = MMap() override def eval(context: Context[Asset], asset: Asset): DataFrame = { - asset.source match { - case s: Input => Semantic.any[Input, DataFrame](s) - case s: Output => Debug[Output].debug(eval(context, 
context(s.assetRef)), s) - case s: Transformation => - Debug[Transformation].debug(eval(context, context(s.assetRef)), s) - } + val registerCallBack: DataFrame => DataFrame = register(global, asset, _) + val getTableCallBack: Asset => Option[DataFrame] = a => global.get(a.assetRef) + resolveAndRegister(context, asset, eval, getTableCallBack, registerCallBack) } } + def resolve( + context: Context[Asset], + asset: Asset, + getOrEval: (Context[Asset], Asset) => DataFrame + )(implicit S: SparkSession): DataFrame = + asset.source match { + case s: Input => Debug.input(s).as(asset.assetRef) + + case s: Output => + val inner = getOrEval(context, context(s.assetRef)) + val result = Debug.output(inner).as(asset.assetRef) + result + + case s: Transformation => + lazy val diffContext: Context[Asset] = + context.filterNot { case (ref, _) => ref == asset.assetRef } + + lazy val others: Context[DataFrame] = + diffContext.map { case (ref, other) => + ref -> getOrEval(diffContext, other) + } + + val inner = getOrEval(context, context(s.assetRef)) + val result = Debug.transformation(s, inner, others).as(s.assetRef) + result + + } + + def resolveAndRegister( + context: Context[Asset], + asset: Asset, + evalCallBack: (Context[Asset], Asset) => DataFrame, + getTableCallBack: Asset => Option[DataFrame], + registerCallBack: DataFrame => DataFrame + )(implicit S: SparkSession): DataFrame = { + + val getOrEval: (Context[Asset], Asset) => DataFrame = + (context, asset) => + registerCallBack(getTableCallBack(asset).getOrElse(evalCallBack(context, asset))) + val df = resolve(context, asset, getOrEval) + registerCallBack(df) + } implicit def debugContext[T: EvalAsset]: EvalContext[Context[T]] = (context: Context[Asset]) => context.map { case (ref, asset) => diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/execution.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/execution.scala index 9a35562..c2467fb 100644 --- 
a/semantic/src/main/scala/com/eff3ct/teckel/semantic/execution.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/execution.scala @@ -35,21 +35,20 @@ object execution { implicit def exec(implicit S: SparkSession): EvalAsset[Unit] = (context: Context[Asset], asset: Asset) => { - asset.source match { - case o: Output => - val EA: EvalAsset[DataFrame] = debug - Exec[Output].eval( - EA.eval(context, asset), - o - ) // TODO: Check if the asset is already evaluated + asset match { + case Asset(_, o: Output) => + val EA = debug(S) + Exec[Output].eval(EA.eval(context, asset), o) + case _ => () + } } implicit def execContext(implicit E: EvalAsset[Unit]): EvalContext[Unit] = (context: Context[Asset]) => context.foreach { - case (ref, asset @ Asset(_, _: Output)) => + case (ref, asset @ Asset(_, source: Output)) => ref -> EvalAsset[Unit].eval(context, asset) case _ => () } diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Debug.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Debug.scala index a79406d..6d2ea97 100644 --- a/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Debug.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Debug.scala @@ -25,57 +25,58 @@ package com.eff3ct.teckel.semantic.sources import cats.data.NonEmptyList +import com.eff3ct.teckel.model.Context import com.eff3ct.teckel.model.Source._ -import com.eff3ct.teckel.semantic.SemanticA -import com.eff3ct.teckel.semantic.core.Semantic import org.apache.spark.sql.functions.expr import org.apache.spark.sql.{DataFrame, RelationalGroupedDataset, SparkSession} -trait Debug[S] extends Semantic[S, DataFrame, DataFrame] { - def debug(df: DataFrame, source: S): DataFrame = eval(df, source) -} - object Debug { - def apply[S: Debug]: Debug[S] = implicitly[Debug[S]] - implicit def input[S <: Input](implicit S: SparkSession): SemanticA[S, DataFrame] = - Semantic.pure((source: S) => - 
S.read.format(source.format).options(source.options).load(source.sourceRef) - ) + def input[S <: Input](source: S)(implicit S: SparkSession): DataFrame = + S.read.format(source.format).options(source.options).load(source.sourceRef) - implicit val output: Debug[Output] = - (df, _) => df + def output[S <: Output]: DataFrame => DataFrame = identity[DataFrame] /** Transformation */ - implicit val transformation: Debug[Transformation] = - (df, source) => - source match { - case s: Select => Debug[Select].debug(df, s) - case s: Where => Debug[Where].debug(df, s) - case s: GroupBy => Debug[GroupBy].debug(df, s) - case s: OrderBy => Debug[OrderBy].debug(df, s) - } + def transformation(source: Transformation, df: DataFrame, others: => Context[DataFrame]): DataFrame = + source match { + case s: Select => select(df, s) + case s: Where => where(df, s) + case s: GroupBy => groupBy(df, s) + case s: OrderBy => orderBy(df, s) + case s: Join => join(s, df, others) + } /** Select */ - implicit val select: Debug[Select] = - (df, source) => df.select(source.columns.toList.map(df(_)): _*) + def select[S <: Select](df: DataFrame, source: S): DataFrame = + df.select(source.columns.toList.map(df(_)): _*) /** Where */ - implicit val whereS: Debug[Where] = - (df, source) => df.where(source.condition) + def where[S <: Where](df: DataFrame, source: S): DataFrame = + df.where(source.condition) /** GroupBy */ - implicit val groupByS: Debug[GroupBy] = - (df, source) => { - val relDF: RelationalGroupedDataset = df.groupBy(source.by.toList.map(df(_)): _*) - source.aggregate match { - case NonEmptyList(a, Nil) => relDF.agg(expr(a)) - case NonEmptyList(a, tail) => relDF.agg(expr(a), tail.map(expr): _*) - } + def groupBy[S <: GroupBy](df: DataFrame, source: S): DataFrame = { + val relDF: RelationalGroupedDataset = df.groupBy(source.by.toList.map(df(_)): _*) + source.aggregate match { + case NonEmptyList(a, Nil) => relDF.agg(expr(a)) + case NonEmptyList(a, tail) => relDF.agg(expr(a), tail.map(expr): 
_*) } + } /** OrderBy */ // TODO: implement the asc/desc order - implicit val orderByS: Debug[OrderBy] = - (df, source) => df.orderBy(source.by.toList.map(df(_)): _*) + def orderBy[S <: OrderBy](df: DataFrame, source: S): DataFrame = + df.orderBy(source.by.toList.map(df(_)): _*) + + /** Join */ + def join[S <: Join](source: S, df: DataFrame, context: Context[DataFrame]): DataFrame = { + val relations: NonEmptyList[(Relation, DataFrame)] = + source.others.map(other => other -> context(other.name)) + + relations.foldLeft(df) { case (left, (relation, right)) => + val condition = relation.on.map(cond => expr(cond)).reduce(_ && _) + left.join(right, condition, relation.joinType) + } + } } diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Exec.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Exec.scala index 6eb81c2..2fd4237 100644 --- a/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Exec.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/Exec.scala @@ -25,10 +25,6 @@ package com.eff3ct.teckel.semantic.sources import com.eff3ct.teckel.model.Source.Output -import com.eff3ct.teckel.semantic.core.Semantic -import org.apache.spark.sql.DataFrame - -trait Exec[-S] extends Semantic[S, DataFrame, Unit] object Exec { def apply[S: Exec]: Exec[S] = implicitly[Exec[S]] diff --git a/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/package.scala b/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/package.scala index 05133b5..a007127 100644 --- a/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/package.scala +++ b/semantic/src/main/scala/com/eff3ct/teckel/semantic/sources/package.scala @@ -24,11 +24,15 @@ package com.eff3ct.teckel.semantic +import com.eff3ct.teckel.semantic.core.{Semantic, SemanticMany} import org.apache.spark.sql.DataFrame package object sources { - def debug[S: Debug]: (DataFrame, S) => DataFrame = Debug[S].debug + type Debug[S] = Semantic[S, DataFrame, DataFrame] + 
type DebugMany[S] = SemanticMany[S, DataFrame, DataFrame] + type Exec[S] = Semantic[S, DataFrame, Unit] + + def DebugMany[S: DebugMany]: DebugMany[S] = implicitly[DebugMany[S]] - def exec[S: Exec]: (DataFrame, S) => Unit = Exec[S].eval } diff --git a/semantic/src/test/resources/data/csv/example-2.csv b/semantic/src/test/resources/data/csv/example-2.csv new file mode 100644 index 0000000..1087e6a --- /dev/null +++ b/semantic/src/test/resources/data/csv/example-2.csv @@ -0,0 +1,10 @@ +ID|Date|Symbol|Adj Close|Close|High|Low|Open|Volume +1|2024-11-05|ZTS|175.27|175.27|176.80|172.25|174.55|2453800.0 +2|2024-11-06|ZTS|170.3699951171875|170.3699951171875|178.9199981689453|169.30999755859375|178.52000427246094|5362100.0 +3|2024-11-07|ZTS|174.25|174.25|174.7899932861328|169.63999938964844|172.58999633789062|3781300.0 +4|2024-11-08|ZTS|176.82000732421875|176.82000732421875|177.10000610351562|173.22000122070312|174.25|3243400.0 +5|2024-11-11|ZTS|176.14999389648438|176.14999389648438|178.3800048828125|175.0|176.92999267578125|3399500.0 +6|2024-11-12|ZTS|173.9600067138672|173.9600067138672|176.50999450683594|173.75|175.38999938964844|2704100.0 +7|2024-11-13|ZTS|177.0399932861328|177.0399932861328|177.5|174.91000366210938|175.32000732421875|2375300.0 +8|2024-11-14|ZTS|174.6300048828125|174.6300048828125|178.97000122070312|173.80999755859375|177.47999572753906|3009800.0 +9|2024-11-15|ZTS|175.13999938964844|175.13999938964844|177.07000732421875|170.75|173.0|3426500.0 \ No newline at end of file diff --git a/semantic/src/test/resources/data/csv/example.csv b/semantic/src/test/resources/data/csv/example.csv index 4dd58b4..865fa02 100644 --- a/semantic/src/test/resources/data/csv/example.csv +++ b/semantic/src/test/resources/data/csv/example.csv @@ -1,29 +1,29 @@ -Date|Symbol|Adj Close|Close|High|Low|Open|Volume -2024-11-05|ZTS|175.27|175.27|176.80|172.25|174.55|2453800.0 
-2024-11-06|ZTS|170.3699951171875|170.3699951171875|178.9199981689453|169.30999755859375|178.52000427246094|5362100.0 -2024-11-07|ZTS|174.25|174.25|174.7899932861328|169.63999938964844|172.58999633789062|3781300.0 -2024-11-08|ZTS|176.82000732421875|176.82000732421875|177.10000610351562|173.22000122070312|174.25|3243400.0 -2024-11-11|ZTS|176.14999389648438|176.14999389648438|178.3800048828125|175.0|176.92999267578125|3399500.0 -2024-11-12|ZTS|173.9600067138672|173.9600067138672|176.50999450683594|173.75|175.38999938964844|2704100.0 -2024-11-13|ZTS|177.0399932861328|177.0399932861328|177.5|174.91000366210938|175.32000732421875|2375300.0 -2024-11-14|ZTS|174.6300048828125|174.6300048828125|178.97000122070312|173.80999755859375|177.47999572753906|3009800.0 -2024-11-15|ZTS|175.13999938964844|175.13999938964844|177.07000732421875|170.75|173.0|3426500.0 -2024-11-18|ZTS|176.4199981689453|176.4199981689453|177.1999969482422|173.66000366210938|174.1300048828125|3172900.0 -2024-11-19|ZTS|175.55999755859375|175.55999755859375|176.5|173.24000549316406|174.8800048828125|2208300.0 -2024-11-20|ZTS|175.6699981689453|175.6699981689453|177.41000366210938|173.8300018310547|176.4199981689453|2187300.0 -2024-11-21|ZTS|176.7100067138672|176.7100067138672|177.66000366210938|174.5500030517578|175.6999969482422|2019500.0 -2024-11-22|ZTS|176.9600067138672|176.9600067138672|178.07000732421875|176.27999877929688|176.35000610351562|1854600.0 -2024-11-25|ZTS|178.7100067138672|178.7100067138672|178.8000030517578|176.14999389648438|177.0|4558300.0 -2024-11-26|ZTS|175.6999969482422|175.6999969482422|178.64999389648438|174.83999633789062|178.4499969482422|2539600.0 -2024-11-27|ZTS|176.74000549316406|176.74000549316406|179.27000427246094|175.0|175.27000427246094|2315800.0 -2024-11-29|ZTS|175.25|175.25|177.80999755859375|175.24000549316406|176.92999267578125|1543400.0 -2024-12-02|ZTS|176.80999755859375|176.80999755859375|176.91000366210938|173.72999572753906|175.77999877929688|2391500.0 
-2024-12-03|ZTS|176.94000244140625|176.94000244140625|181.39999389648438|176.55999755859375|176.7100067138672|2679000.0 -2024-12-04|ZTS|175.32000732421875|175.32000732421875|178.5|174.5399932861328|174.60000610351562|2687000.0 -2024-12-05|ZTS|174.77000427246094|174.77000427246094|176.52999877929688|173.72000122070312|175.27000427246094|2442000.0 -2024-12-06|ZTS|176.4600067138672|176.4600067138672|177.5500030517578|174.41000366210938|174.77000427246094|2551200.0 -2024-12-09|ZTS|178.14999389648438|178.14999389648438|179.77999877929688|175.0800018310547|175.8800048828125|2387300.0 -2024-12-10|ZTS|176.7100067138672|176.7100067138672|178.50999450683594|176.1999969482422|177.8800048828125|1678200.0 -2024-12-11|ZTS|177.1699981689453|177.1699981689453|178.3800048828125|175.80999755859375|176.25999450683594|1782400.0 -2024-12-12|ZTS|178.83999633789062|178.83999633789062|179.6999969482422|176.55999755859375|176.9499969482422|1936000.0 -2024-12-13|ZTS|178.17999267578125|178.17999267578125|181.85000610351562|176.6300048828125|178.97999572753906|1650300.0 +ID|Date|Symbol|Adj Close|Close|High|Low|Open|Volume +1|2024-11-05|ZTS|175.27|175.27|176.80|172.25|174.55|2453800.0 +2|2024-11-06|ZTS|170.3699951171875|170.3699951171875|178.9199981689453|169.30999755859375|178.52000427246094|5362100.0 +3|2024-11-07|ZTS|174.25|174.25|174.7899932861328|169.63999938964844|172.58999633789062|3781300.0 +4|2024-11-08|ZTS|176.82000732421875|176.82000732421875|177.10000610351562|173.22000122070312|174.25|3243400.0 +5|2024-11-11|ZTS|176.14999389648438|176.14999389648438|178.3800048828125|175.0|176.92999267578125|3399500.0 +6|2024-11-12|ZTS|173.9600067138672|173.9600067138672|176.50999450683594|173.75|175.38999938964844|2704100.0 +7|2024-11-13|ZTS|177.0399932861328|177.0399932861328|177.5|174.91000366210938|175.32000732421875|2375300.0 +8|2024-11-14|ZTS|174.6300048828125|174.6300048828125|178.97000122070312|173.80999755859375|177.47999572753906|3009800.0 
+9|2024-11-15|ZTS|175.13999938964844|175.13999938964844|177.07000732421875|170.75|173.0|3426500.0 +10|2024-11-18|ZTS|176.4199981689453|176.4199981689453|177.1999969482422|173.66000366210938|174.1300048828125|3172900.0 +11|2024-11-19|ZTS|175.55999755859375|175.55999755859375|176.5|173.24000549316406|174.8800048828125|2208300.0 +12|2024-11-20|ZTS|175.6699981689453|175.6699981689453|177.41000366210938|173.8300018310547|176.4199981689453|2187300.0 +13|2024-11-21|ZTS|176.7100067138672|176.7100067138672|177.66000366210938|174.5500030517578|175.6999969482422|2019500.0 +14|2024-11-22|ZTS|176.9600067138672|176.9600067138672|178.07000732421875|176.27999877929688|176.35000610351562|1854600.0 +15|2024-11-25|ZTS|178.7100067138672|178.7100067138672|178.8000030517578|176.14999389648438|177.0|4558300.0 +16|2024-11-26|ZTS|175.6999969482422|175.6999969482422|178.64999389648438|174.83999633789062|178.4499969482422|2539600.0 +17|2024-11-27|ZTS|176.74000549316406|176.74000549316406|179.27000427246094|175.0|175.27000427246094|2315800.0 +18|2024-11-29|ZTS|175.25|175.25|177.80999755859375|175.24000549316406|176.92999267578125|1543400.0 +19|2024-12-02|ZTS|176.80999755859375|176.80999755859375|176.91000366210938|173.72999572753906|175.77999877929688|2391500.0 +20|2024-12-03|ZTS|176.94000244140625|176.94000244140625|181.39999389648438|176.55999755859375|176.7100067138672|2679000.0 +21|2024-12-04|ZTS|175.32000732421875|175.32000732421875|178.5|174.5399932861328|174.60000610351562|2687000.0 +22|2024-12-05|ZTS|174.77000427246094|174.77000427246094|176.52999877929688|173.72000122070312|175.27000427246094|2442000.0 +23|2024-12-06|ZTS|176.4600067138672|176.4600067138672|177.5500030517578|174.41000366210938|174.77000427246094|2551200.0 +24|2024-12-09|ZTS|178.14999389648438|178.14999389648438|179.77999877929688|175.0800018310547|175.8800048828125|2387300.0 +25|2024-12-10|ZTS|176.7100067138672|176.7100067138672|178.50999450683594|176.1999969482422|177.8800048828125|1678200.0 
+26|2024-12-11|ZTS|177.1699981689453|177.1699981689453|178.3800048828125|175.80999755859375|176.25999450683594|1782400.0 +27|2024-12-12|ZTS|178.83999633789062|178.83999633789062|179.6999969482422|176.55999755859375|176.9499969482422|1936000.0 +28|2024-12-13|ZTS|178.17999267578125|178.17999267578125|181.85000610351562|176.6300048828125|178.97999572753906|1650300.0 \ No newline at end of file diff --git a/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala b/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala index 7d56bc7..781e968 100644 --- a/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala +++ b/semantic/src/test/scala/com/eff3ct/teckel/semantic/DebugSource.scala @@ -24,13 +24,8 @@ package com.eff3ct.teckel.semantic -import cats.data.NonEmptyList -import com.eff3ct.teckel.model.Source._ -import com.eff3ct.teckel.semantic.core.Semantic import com.eff3ct.teckel.semantic.sources.Debug -import com.eff3ct.teckel.semantic.sources.Debug._ import com.holdenkarau.spark.testing._ -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.scalatest.flatspec.AnyFlatSpecLike import org.scalatest.matchers.should.Matchers @@ -39,61 +34,29 @@ class DebugSource extends AnyFlatSpecLike with Matchers with DataFrameSuiteBase - with SparkTestUtils { + with SparkTestUtils + with TestResources { - object Resources { - val input: DataFrame = spark.read - .format("csv") - .option("header", "true") - .option("sep", "|") - .load("src/test/resources/data/csv/example.csv") - } - - object Sources { - - val input: Input = - Input("csv", Map("header" -> "true", "sep" -> "|"), "src/test/resources/data/csv/example.csv") - - val output: Output = - Output("table1", "parquet", "overwrite", Map(), "src/test/resources/data/parquet/example") - - val select: Select = Select("table1", NonEmptyList.of("Symbol", "Date")) - - val where: Where = Where("table1", "Date > '2024-12-12'") - - val groupBy: GroupBy = GroupBy( - 
"table1", - NonEmptyList.of("Symbol"), - NonEmptyList.of( - "sum(`Adj Close`) as TotalClose", - "max(High) as Highest", - "min(Low) as Lowest" - ) - ) - - val orderBy: OrderBy = OrderBy("table1", NonEmptyList.of("High"), Some("Asc")) - - } "DebugSource" should "debug an input source" in { - Semantic.any[Input, DataFrame](Sources.input) :===: Resources.input + Debug.input(Sources.input) :===: Resources.input } it should "debug an output source" in { - Debug[Output].debug(Resources.input, Sources.output) :===: Resources.input + Debug.output(Resources.input) :===: Resources.input } it should "debug a select transformation" in { - Debug[Select].debug(Resources.input, Sources.select) :===: + Debug.select(Resources.input, Sources.select) :===: Resources.input.select("Symbol", "Date") } it should "debug a where transformation" in { - Debug[Where].debug(Resources.input, Sources.where) :===: + Debug.where(Resources.input, Sources.where) :===: Resources.input.where("Date > '2024-12-12'") } it should "debug a groupBy transformation" in { - Debug[GroupBy].debug(Resources.input, Sources.groupBy) :===: + Debug.groupBy(Resources.input, Sources.groupBy) :===: Resources.input .groupBy("Symbol") .agg( @@ -104,8 +67,20 @@ class DebugSource } it should "debug an orderBy transformation" in { - Debug[OrderBy].debug(Resources.input, Sources.orderBy) :===: + Debug.orderBy(Resources.input, Sources.orderBy) :===: Resources.input.orderBy("High") } + it should "debug a join transformation using column expressions" in { + Debug.join(Sources.join, Resources.input, Map("table2" -> Resources.input2)) :===: + Resources.input + .join(Resources.input2, Resources.input("id") === Resources.input2("id"), "inner") + } + + it should "debug a join transformation using a condition column expression" in { + Debug.join(Sources.join, Resources.input, Map("table2" -> Resources.input2)) :===: + Resources.input + .join(Resources.input2, expr("table1.id == table2.id"), "inner") + } + } diff --git 
a/semantic/src/test/scala/com/eff3ct/teckel/semantic/EvalAssetDebugSpec.scala b/semantic/src/test/scala/com/eff3ct/teckel/semantic/EvalAssetDebugSpec.scala new file mode 100644 index 0000000..310b374 --- /dev/null +++ b/semantic/src/test/scala/com/eff3ct/teckel/semantic/EvalAssetDebugSpec.scala @@ -0,0 +1,70 @@ +package com.eff3ct.teckel.semantic + +import com.eff3ct.teckel.model.Asset +import com.eff3ct.teckel.semantic.core.EvalAsset +import com.eff3ct.teckel.semantic.evaluation._ +import com.holdenkarau.spark.testing._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{max, min, sum} +import org.scalatest.flatspec.AnyFlatSpecLike +import org.scalatest.matchers.should.Matchers + +class EvalAssetDebugSpec + extends AnyFlatSpecLike + with Matchers + with DataFrameSuiteBase + with SparkTestUtils + with TestResources { + + "EvalAsset" should "debug an input asset" in { + val inputAsset: Asset = Assets.inputA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, inputAsset) + result :===: Resources.input + } + + it should "debug an output asset" in { + val outputAsset: Asset = Assets.outputA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, outputAsset) + result :===: Resources.input + } + + it should "debug a select asset" in { + val selectAsset: Asset = Assets.selectA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, selectAsset) + result :===: Resources.input.select("Symbol", "Date") + } + + it should "debug a where asset" in { + val whereAsset: Asset = Assets.whereA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, whereAsset) + result :===: Resources.input.where("Date > '2024-12-12'") + } + + it should "debug a groupBy asset" in { + val groupByAsset: Asset = Assets.groupByA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, groupByAsset) + result :===: Resources.input + .groupBy("Symbol") + .agg( + sum("Adj Close") as "TotalClose", + max("High") as 
"Highest", + min("Low") as "Lowest" + ) + } + + it should "debug a orderBy asset" in { + val orderByAsset: Asset = Assets.orderByA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, orderByAsset) + result :===: Resources.input.orderBy("High") + } + + it should "debug a join asset" in { + val joinAsset: Asset = Assets.joinA + val result: DataFrame = EvalAsset[DataFrame].eval(Assets.context, joinAsset) + result :===: Resources.input.join( + Resources.input2, + Resources.input("id") === Resources.input2("id"), + "inner" + ) + } +} diff --git a/semantic/src/test/scala/com/eff3ct/teckel/semantic/TestResources.scala b/semantic/src/test/scala/com/eff3ct/teckel/semantic/TestResources.scala new file mode 100644 index 0000000..e45c824 --- /dev/null +++ b/semantic/src/test/scala/com/eff3ct/teckel/semantic/TestResources.scala @@ -0,0 +1,87 @@ +package com.eff3ct.teckel.semantic + +import cats.data.NonEmptyList +import com.eff3ct.teckel.model.Source._ +import com.eff3ct.teckel.model.{Asset, Context} +import com.holdenkarau.spark.testing.DataFrameSuiteBase +import org.apache.spark.sql.DataFrame + +trait TestResources { + self: DataFrameSuiteBase with SparkTestUtils => + + object Resources { + val input: DataFrame = spark.read + .format("csv") + .option("header", "true") + .option("sep", "|") + .load("src/test/resources/data/csv/example.csv") + .as("table1") + + val input2: DataFrame = spark.read + .format("csv") + .option("header", "true") + .option("sep", "|") + .load("src/test/resources/data/csv/example-2.csv") + .as("table2") + } + + object Sources { + + val input: Input = + Input("csv", Map("header" -> "true", "sep" -> "|"), "src/test/resources/data/csv/example.csv") + + val input2: Input = + Input( + "csv", + Map("header" -> "true", "sep" -> "|"), + "src/test/resources/data/csv/example-2.csv" + ) + + val output: Output = + Output("table1", "parquet", "overwrite", Map(), "src/test/resources/data/parquet/example") + + val select: Select = Select("table1", 
NonEmptyList.of("Symbol", "Date")) + + val where: Where = Where("table1", "Date > '2024-12-12'") + + val groupBy: GroupBy = GroupBy( + "table1", + NonEmptyList.of("Symbol"), + NonEmptyList.of( + "sum(`Adj Close`) as TotalClose", + "max(High) as Highest", + "min(Low) as Lowest" + ) + ) + + val orderBy: OrderBy = OrderBy("table1", NonEmptyList.of("High"), Some("Asc")) + + val join: Join = + Join( + "table1", + NonEmptyList.of( + Relation("table2", "inner", List("table1.id == table2.id")) + ) + ) + + } + + object Assets { + val inputA: Asset = Asset("table1", Sources.input) + val input2A: Asset = Asset("table2", Sources.input2) + val outputA: Asset = Asset("table1", Sources.output) + val selectA: Asset = Asset("tableSelect", Sources.select) + val whereA: Asset = Asset("tableWhere", Sources.where) + val groupByA: Asset = Asset("tableGroupBy", Sources.groupBy) + val orderByA: Asset = Asset("tableOrderBy", Sources.orderBy) + val joinA: Asset = Asset("tableJoin", Sources.join) + val context: Context[Asset] = Map( + "table1" -> inputA, + "table2" -> input2A, + "tableSelect" -> selectA, + "tableWhere" -> whereA, + "tableGroupBy" -> groupByA, + "tableOrderBy" -> orderByA + ) + } +} diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala index 6799a31..9175209 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/operations.scala @@ -28,7 +28,7 @@ import cats.data.NonEmptyList import cats.implicits._ import io.circe.generic.auto._ import io.circe.syntax._ -import io.circe.{Decoder, Encoder} +import io.circe.{Decoder, Encoder, Json} object operations { @@ -40,6 +40,7 @@ object operations { case w: WhereOp => w.asJson case g: GroupByOp => g.asJson case o: OrderByOp => o.asJson + case j: JoinOp => j.asJson } implicit val decodeEvent: Decoder[Operation] = @@ -47,7 
+48,8 @@ object operations { Decoder[SelectOp].widen, Decoder[WhereOp].widen, Decoder[GroupByOp].widen, - Decoder[OrderByOp].widen + Decoder[OrderByOp].widen, + Decoder[JoinOp].widen ).reduceLeft(_ or _) case class SelectOp(from: String, columns: NonEmptyList[String]) extends Operation @@ -57,4 +59,26 @@ object operations { case class OrderByOp(from: String, by: NonEmptyList[String], order: Option[String]) extends Operation + case class JoinOp(left: String, right: NonEmptyList[Relation]) extends Operation + + case class Relation(name: String, relationType: String, on: List[String]) + + implicit val encodeRelationType: Encoder[Relation] = + Encoder.instance { r => + Json.obj( + "name" -> r.name.asJson, + "type" -> r.relationType.asJson, + "on" -> r.on.asJson + ) + } + + implicit val decodeRelationType: Decoder[Relation] = + Decoder.instance { c => + for { + name <- c.downField("name").as[String] + relationType <- c.downField("type").as[String] + on <- c.downField("on").as[List[String]] + } yield Relation(name, relationType, on) + } + } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala index eebb1c8..94434f0 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/serializer/model/transformation.scala @@ -39,6 +39,7 @@ object transformation { case w: Where => w.asJson case g: GroupBy => g.asJson case o: OrderBy => o.asJson + case j: Join => j.asJson } implicit val decodeEvent: Decoder[Transformation] = @@ -46,12 +47,14 @@ object transformation { Decoder[Select].widen, Decoder[Where].widen, Decoder[GroupBy].widen, - Decoder[OrderBy].widen + Decoder[OrderBy].widen, + Decoder[Join].widen ).reduceLeft(_ or _) case class Select(name: String, select: SelectOp) extends Transformation case class Where(name: String, where: WhereOp) extends Transformation case 
class GroupBy(name: String, group: GroupByOp) extends Transformation case class OrderBy(name: String, order: OrderByOp) extends Transformation + case class Join(name: String, join: JoinOp) extends Transformation } diff --git a/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala b/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala index 1f7ad72..5c9c672 100644 --- a/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala +++ b/serializer/src/main/scala/com/eff3ct/teckel/transform/Rewrite.scala @@ -29,6 +29,7 @@ import cats.data.NonEmptyList import com.eff3ct.teckel.model.{Asset, Context, Source} import com.eff3ct.teckel.serializer.model.etl._ import com.eff3ct.teckel.serializer.model.input._ +import com.eff3ct.teckel.serializer.model.operations.Relation import com.eff3ct.teckel.serializer.model.output._ import com.eff3ct.teckel.serializer.model.transformation._ import com.eff3ct.teckel.serializer.types.PrimitiveType @@ -60,12 +61,19 @@ object Rewrite { def rewriteOp(item: OrderBy): Asset = Asset(item.name, Source.OrderBy(item.order.from, item.order.by, item.order.order)) + def rewriteOp(item: Join): Asset = + Asset(item.name, Source.Join(item.join.left, item.join.right.map(rewriteOp))) + + def rewriteOp(item: Relation): Source.Relation = + Source.Relation(item.name, item.relationType, item.on) + def rewrite(item: Transformation): Asset = item match { case s: Select => rewriteOp(s) case s: Where => rewriteOp(s) case s: GroupBy => rewriteOp(s) case s: OrderBy => rewriteOp(s) + case s: Join => rewriteOp(s) } def icontext(item: NonEmptyList[Input]): Context[Asset] = From 56bab1153bd627d733be1f3c0e4f606d7253ed45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafael=20Fern=C3=A1ndez?= <40717893+rafafrdz@users.noreply.github.com> Date: Tue, 28 Jan 2025 20:30:34 +0100 Subject: [PATCH 20/20] Fix Codecov CICD (#35) --- .github/workflows/ci.yml | 2 +- .github/workflows/release.yml | 2 +- project/SonatypePublish.scala | 8 ++++---- 3 files 
changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e31057..078811f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - slug: eff3ct0/supabase-auth-scala + slug: eff3ct0/teckel files: target/scala-*/scoverage-report/scoverage.xml fail_ci_if_error: true diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f9d3ec1..6725ae9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - slug: eff3ct0/supabase-auth-scala + slug: eff3ct0/teckel files: target/scala-*/scoverage-report/scoverage.xml fail_ci_if_error: true diff --git a/project/SonatypePublish.scala b/project/SonatypePublish.scala index 4a42d9a..54411ba 100644 --- a/project/SonatypePublish.scala +++ b/project/SonatypePublish.scala @@ -13,12 +13,12 @@ object SonatypePublish { ThisBuild / sonatypeCredentialHost := sonatypeCentralHost, ThisBuild / organization := "com.eff3ct", ThisBuild / organizationName := "eff3ct", - ThisBuild / homepage := Some(url("https://github.com/rafafrdz/teckel")), + ThisBuild / homepage := Some(url("https://github.com/eff3ct0/teckel")), ThisBuild / licenses := Seq("MIT" -> url("https://opensource.org/licenses/MIT")), ThisBuild / scmInfo := Some( ScmInfo( - browseUrl = url("https://github.com/rafafrdz/teckel"), - connection = "scm:git:git@github.com:rafafrdz/teckel.git" + browseUrl = url("https://github.com/eff3ct0/teckel"), + connection = "scm:git:git@github.com:eff3ct0/teckel.git" ) ), ThisBuild / developers := List( @@ -30,7 +30,7 @@ object SonatypePublish { ) ), ThisBuild / sonatypeProjectHosting := Some( - GitHubHosting("rafafrdz", "teckel", "hi@rafaelfernandez.dev") + GitHubHosting("eff3ct0", "teckel", "hi@rafaelfernandez.dev") ) )