Skip to content

Commit

Permalink
Add Demo for OSDS (apache#173)
Browse files Browse the repository at this point in the history
* Add demo

* update notebook state, more ignore, add Readme

* make sure jars directory is created

* update instructions to tell users to use local host option
  • Loading branch information
the-other-tim-brown authored Nov 8, 2023
1 parent b8e40de commit 5d40c6f
Show file tree
Hide file tree
Showing 30 changed files with 5,622 additions and 26 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,8 @@ build
.DS_Store

target/

# Demo files
*.crc
demo/jars/*
demo/notebook/.ipynb_checkpoints/*
14 changes: 0 additions & 14 deletions core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-collection-compat_${scala.version.prefix}</artifactId>
</dependency>

<!-- Hudi dependencies -->
<dependency>
Expand Down Expand Up @@ -119,21 +115,11 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version.prefix}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<scope>test</scope>
</dependency>

<!-- Mockito -->
<dependency>
Expand Down
10 changes: 10 additions & 0 deletions demo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Running a Local Demo
This demo was created for the 2023 Open Source Data Summit. It shows how OneTable can be used with two existing datasets.

Use `./start_demo.sh` to spin up a local Jupyter notebook with a Scala interpreter, Hive Metastore, and Trino in Docker containers. The script will first build the OneTable jars required for the demo and then start the containers.

## Accessing Services
### Jupyter Notebook
To access the notebook, look for a log line during startup that contains `To access the server, open this file in a browser: ... Or copy and paste one of these URLs: ...`, then open the `http://127.0.0.1:8888/...` URL from that line in your browser. The demo is located at `work/demo.ipynb`.
### Trino
You can access the local Trino container by running `docker exec -it trino trino`.
5,001 changes: 5,001 additions & 0 deletions demo/data/DimCustomer_round2.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"0109bb2f-b074-44de-b859-87a2641584a8","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"GeographyKey\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"GeographyType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ContinentName\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"CityName\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"StateProvinceName\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"RegionCountryName\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1698961547524}}
{"add":{"path":"part-00000-6ccb71bc-9cc1-4226-a7b5-f752edbedee6-c000.snappy.parquet","partitionValues":{},"size":12822,"modificationTime":1698961549532,"dataChange":true,"stats":"{\"numRecords\":674,\"minValues\":{\"GeographyKey\":\"1\",\"GeographyType\":\"City\",\"ContinentName\":\"Asia\",\"CityName\":\"Albany\",\"StateProvinceName\":\"Ahal Province\",\"RegionCountryName\":\"Armenia\"},\"maxValues\":{\"GeographyKey\":\"952\",\"GeographyType\":\"State/Province\",\"ContinentName\":\"North America\",\"CityName\":\"York\",\"StateProvinceName\":\"Yveline\",\"RegionCountryName\":\"the Netherlands\"},\"nullCount\":{\"GeographyKey\":0,\"GeographyType\":0,\"ContinentName\":0,\"CityName\":157,\"StateProvinceName\":37,\"RegionCountryName\":3}}"}}
{"commitInfo":{"timestamp":1698961549602,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"674","numOutputBytes":"12822"},"engineInfo":"Apache-Spark/3.2.3 Delta-Lake/2.0.2","txnId":"a674dea2-f76a-4db8-b6d6-2ec8214428de"}}
Binary file not shown.
38 changes: 38 additions & 0 deletions demo/data/db/hudi_dimCustomer/.hoodie/20231105172136406.commit
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"partitionToWriteStats" : {
"" : [ {
"fileId" : "b1ee44c7-255b-47bf-b788-1e7017790326-0",
"path" : "b1ee44c7-255b-47bf-b788-1e7017790326-0_0-80-172_20231105172136406.parquet",
"cdcStats" : null,
"prevCommit" : "null",
"numWrites" : 5000,
"numDeletes" : 0,
"numUpdateWrites" : 0,
"numInserts" : 5000,
"totalWriteBytes" : 79392,
"totalWriteErrors" : 0,
"tempPath" : null,
"partitionPath" : "",
"totalLogRecords" : 0,
"totalLogFilesCompacted" : 0,
"totalLogSizeCompacted" : 0,
"totalUpdatedRecordsCompacted" : 0,
"totalLogBlocks" : 0,
"totalCorruptLogBlock" : 0,
"totalRollbackBlocks" : 0,
"fileSizeInBytes" : 79392,
"minEventTime" : null,
"maxEventTime" : null,
"runtimeStats" : {
"totalScanTime" : 0,
"totalUpsertTime" : 0,
"totalCreateTime" : 700
}
} ]
},
"compacted" : false,
"extraMetadata" : {
"schema" : "{\"type\":\"record\",\"name\":\"Sample\",\"fields\":[{\"name\":\"_c0\",\"type\":\"string\"},{\"name\":\"CustomerKey\",\"type\":\"string\"},{\"name\":\"GeographyKey\",\"type\":\"string\"},{\"name\":\"FirstName\",\"type\":\"string\"},{\"name\":\"LastName\",\"type\":\"string\"},{\"name\":\"BirthDate\",\"type\":\"string\"},{\"name\":\"MaritalStatus\",\"type\":\"string\"},{\"name\":\"Gender\",\"type\":\"string\"},{\"name\":\"YearlyIncome\",\"type\":\"string\"},{\"name\":\"TotalChildren\",\"type\":\"string\"},{\"name\":\"NumberChildrenAtHome\",\"type\":\"string\"},{\"name\":\"Education\",\"type\":\"string\"},{\"name\":\"Occupation\",\"type\":\"string\"},{\"name\":\"HouseOwnerFlag\",\"type\":\"string\"},{\"name\":\"NumberCarsOwned\",\"type\":\"string\"}]}"
},
"operationType" : "INSERT"
}
Empty file.
32 changes: 32 additions & 0 deletions demo/data/db/hudi_dimCustomer/.hoodie/20231105172136406.inflight
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"partitionToWriteStats" : {
"" : [ {
"fileId" : "",
"path" : null,
"cdcStats" : null,
"prevCommit" : "null",
"numWrites" : 0,
"numDeletes" : 0,
"numUpdateWrites" : 0,
"numInserts" : 5000,
"totalWriteBytes" : 0,
"totalWriteErrors" : 0,
"tempPath" : null,
"partitionPath" : null,
"totalLogRecords" : 0,
"totalLogFilesCompacted" : 0,
"totalLogSizeCompacted" : 0,
"totalUpdatedRecordsCompacted" : 0,
"totalLogBlocks" : 0,
"totalCorruptLogBlock" : 0,
"totalRollbackBlocks" : 0,
"fileSizeInBytes" : 0,
"minEventTime" : null,
"maxEventTime" : null,
"runtimeStats" : null
} ]
},
"compacted" : false,
"extraMetadata" : { },
"operationType" : "INSERT"
}
23 changes: 23 additions & 0 deletions demo/data/db/hudi_dimCustomer/.hoodie/hoodie.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#Updated at 2023-11-05T17:21:37.867Z
#Sun Nov 05 17:21:37 GMT 2023
hoodie.table.timeline.timezone=LOCAL
hoodie.table.keygenerator.class=org.apache.hudi.keygen.NonpartitionedKeyGenerator
hoodie.table.precombine.field=_c0
hoodie.table.version=6
hoodie.database.name=
hoodie.datasource.write.hive_style_partitioning=false
hoodie.table.metadata.partitions.inflight=
hoodie.table.checksum=4138190594
hoodie.partition.metafile.use.base.format=false
hoodie.table.cdc.enabled=false
hoodie.archivelog.folder=archived
hoodie.table.name=hudi_dimCustomer
hoodie.populate.meta.fields=false
hoodie.table.type=COPY_ON_WRITE
hoodie.datasource.write.partitionpath.urlencode=false
hoodie.table.base.file.format=PARQUET
hoodie.datasource.write.drop.partition.columns=false
hoodie.table.metadata.partitions=files
hoodie.timeline.layout.version=1
hoodie.table.recordkey.fields=CustomerKey
hoodie.table.partition.fields=
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"partitionToWriteStats" : {
"files" : [ {
"fileId" : "files-0000-0",
"path" : "files/files-0000-0_0-73-165_00000000000000010.hfile",
"cdcStats" : null,
"prevCommit" : "null",
"numWrites" : 1,
"numDeletes" : 0,
"numUpdateWrites" : 0,
"numInserts" : 1,
"totalWriteBytes" : 6809,
"totalWriteErrors" : 0,
"tempPath" : null,
"partitionPath" : "files",
"totalLogRecords" : 0,
"totalLogFilesCompacted" : 0,
"totalLogSizeCompacted" : 0,
"totalUpdatedRecordsCompacted" : 0,
"totalLogBlocks" : 0,
"totalCorruptLogBlock" : 0,
"totalRollbackBlocks" : 0,
"fileSizeInBytes" : 6809,
"minEventTime" : null,
"maxEventTime" : null,
"runtimeStats" : {
"totalScanTime" : 0,
"totalUpsertTime" : 0,
"totalCreateTime" : 152
}
} ]
},
"compacted" : false,
"extraMetadata" : {
"schema" : "{\"type\":\"record\",\"name\":\"HoodieMetadataRecord\",\"namespace\":\"org.apache.hudi.avro.model\",\"doc\":\"A record saved within the Metadata Table\",\"fields\":[{\"name\":\"key\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"type\",\"type\":\"int\",\"doc\":\"Type of the metadata record\"},{\"name\":\"filesystemMetadata\",\"type\":[\"null\",{\"type\":\"map\",\"values\":{\"type\":\"record\",\"name\":\"HoodieMetadataFileInfo\",\"fields\":[{\"name\":\"size\",\"type\":\"long\",\"doc\":\"Size of the file\"},{\"name\":\"isDeleted\",\"type\":\"boolean\",\"doc\":\"True if this file has been deleted\"}]},\"avro.java.string\":\"String\"}],\"doc\":\"Contains information about partitions and files within the dataset\"},{\"name\":\"BloomFilterMetadata\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"HoodieMetadataBloomFilter\",\"doc\":\"Data file bloom filter details\",\"fields\":[{\"name\":\"type\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"},\"doc\":\"Bloom filter type code\"},{\"name\":\"timestamp\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"},\"doc\":\"Instant timestamp when this metadata was created/updated\"},{\"name\":\"bloomFilter\",\"type\":\"bytes\",\"doc\":\"Bloom filter binary byte array\"},{\"name\":\"isDeleted\",\"type\":\"boolean\",\"doc\":\"Bloom filter entry valid/deleted flag\"}]}],\"doc\":\"Metadata Index of bloom filters for all data files in the user table\",\"default\":null},{\"name\":\"ColumnStatsMetadata\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"HoodieMetadataColumnStats\",\"doc\":\"Data file column statistics\",\"fields\":[{\"name\":\"fileName\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}],\"doc\":\"File name for which this column statistics applies\",\"default\":null},{\"name\":\"columnName\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}],\"doc\":\"Column name for which this column statistics 
applies\",\"default\":null},{\"name\":\"minValue\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"BooleanWrapper\",\"doc\":\"A record wrapping boolean type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"boolean\"}]},{\"type\":\"record\",\"name\":\"IntWrapper\",\"doc\":\"A record wrapping int type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"int\"}]},{\"type\":\"record\",\"name\":\"LongWrapper\",\"doc\":\"A record wrapping long type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"long\"}]},{\"type\":\"record\",\"name\":\"FloatWrapper\",\"doc\":\"A record wrapping float type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"float\"}]},{\"type\":\"record\",\"name\":\"DoubleWrapper\",\"doc\":\"A record wrapping double type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"double\"}]},{\"type\":\"record\",\"name\":\"BytesWrapper\",\"doc\":\"A record wrapping bytes type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"bytes\"}]},{\"type\":\"record\",\"name\":\"StringWrapper\",\"doc\":\"A record wrapping string type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}]},{\"type\":\"record\",\"name\":\"DateWrapper\",\"doc\":\"A record wrapping Date logical type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"int\"}]},{\"type\":\"record\",\"name\":\"DecimalWrapper\",\"doc\":\"A record wrapping Decimal logical type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":{\"type\":\"bytes\",\"logicalType\":\"decimal\",\"precision\":30,\"scale\":15}}]},{\"type\":\"record\",\"name\":\"TimeMicrosWrapper\",\"doc\":\"A record wrapping Time-micros logical type to be able to be used it 
w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":{\"type\":\"long\",\"logicalType\":\"time-micros\"}}]},{\"type\":\"record\",\"name\":\"TimestampMicrosWrapper\",\"doc\":\"A record wrapping Timestamp-micros logical type to be able to be used it w/in Avro's Union\",\"fields\":[{\"name\":\"value\",\"type\":\"long\"}]}],\"doc\":\"Minimum value in the range. Based on user data table schema, we can convert this to appropriate type\",\"default\":null},{\"name\":\"maxValue\",\"type\":[\"null\",\"BooleanWrapper\",\"IntWrapper\",\"LongWrapper\",\"FloatWrapper\",\"DoubleWrapper\",\"BytesWrapper\",\"StringWrapper\",\"DateWrapper\",\"DecimalWrapper\",\"TimeMicrosWrapper\",\"TimestampMicrosWrapper\"],\"doc\":\"Maximum value in the range. Based on user data table schema, we can convert it to appropriate type\",\"default\":null},{\"name\":\"valueCount\",\"type\":[\"null\",\"long\"],\"doc\":\"Total count of values\",\"default\":null},{\"name\":\"nullCount\",\"type\":[\"null\",\"long\"],\"doc\":\"Total count of null values\",\"default\":null},{\"name\":\"totalSize\",\"type\":[\"null\",\"long\"],\"doc\":\"Total storage size on disk\",\"default\":null},{\"name\":\"totalUncompressedSize\",\"type\":[\"null\",\"long\"],\"doc\":\"Total uncompressed storage size on disk\",\"default\":null},{\"name\":\"isDeleted\",\"type\":\"boolean\",\"doc\":\"Column range entry valid/deleted flag\"}]}],\"doc\":\"Metadata Index of column statistics for all data files in the user table\",\"default\":null},{\"name\":\"recordIndexMetadata\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"HoodieRecordIndexInfo\",\"fields\":[{\"name\":\"partitionName\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}],\"doc\":\"Refers to the partition name the record belongs to\",\"default\":null},{\"name\":\"fileIdHighBits\",\"type\":[\"null\",\"long\"],\"doc\":\"Refers to high 64 bits if the fileId is based on UUID format. 
\\nA UUID based fileId is stored as 3 pieces in RLI (fileIdHighBits, fileIdLowBits and fileIndex). \\nFileID format is {UUID}-{fileIndex}.\",\"default\":null},{\"name\":\"fileIdLowBits\",\"type\":[\"null\",\"long\"],\"doc\":\"Refers to low 64 bits if the fileId is based on UUID format. \\nA UUID based fileId is stored as 3 pieces in RLI (fileIdHighBits, fileIdLowBits and fileIndex). \\nFileID format is {UUID}-{fileIndex}.\",\"default\":null},{\"name\":\"fileIndex\",\"type\":[\"null\",\"int\"],\"doc\":\"Index representing file index which is used to re-construct UUID based fileID. Applicable when the fileId is based on UUID format. \\nA UUID based fileId is stored as 3 pieces in RLI (fileIdHighBits, fileIdLowBits and fileIndex). \\nFileID format is {UUID}-{fileIndex}.\",\"default\":null},{\"name\":\"fileId\",\"type\":[\"null\",{\"type\":\"string\",\"avro.java.string\":\"String\"}],\"doc\":\"Represents fileId of the location where record belongs to. When the encoding is 1, fileID is stored in raw string format.\",\"default\":null},{\"name\":\"instantTime\",\"type\":[\"null\",\"long\"],\"doc\":\"Epoch time in millisecond representing the commit time at which record was added\",\"default\":null},{\"name\":\"fileIdEncoding\",\"type\":\"int\",\"doc\":\"Represents fileId encoding. Possible values are 0 and 1. O represents UUID based fileID, and 1 represents raw string format of the fileId. \\nWhen the encoding is 0, reader can deduce fileID from fileIdLowBits, fileIdLowBits and fileIndex.\",\"default\":0}]}],\"doc\":\"Metadata Index that contains information about record keys and their location in the dataset\",\"default\":null}]}"
},
"operationType" : "BULK_INSERT"
}
Loading

0 comments on commit 5d40c6f

Please sign in to comment.