Skip to content

Commit

Permalink
chore: add shfmt (Unstructured-IO#2246)
Browse files Browse the repository at this point in the history
### Description
Given all the shell files that now exist in the repo, it would be nice
to have linting/formatting around them (in addition to the existing
shellcheck which doesn't do anything to format the shell code). This PR
introduces `shfmt` to both check for changes and apply formatting when
the associated make targets are called.
  • Loading branch information
rbiseck3 authored Dec 12, 2023
1 parent 529d1f6 commit 76efcf4
Show file tree
Hide file tree
Showing 117 changed files with 2,380 additions and 2,412 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ jobs:
- name: ShellCheck
uses: ludeeus/action-shellcheck@master

shfmt:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: setup shfmt
uses: mfinelli/setup-shfmt@v3
- name: Run shfmt
run: shfmt -d .


test_unit:
strategy:
Expand Down
13 changes: 12 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ test-extra-xlsx:
.PHONY: check
check: check-ruff check-black check-flake8 check-version check-flake8-print

.PHONY: check-shfmt
check-shfmt:
shfmt -d .

.PHONY: check-black
check-black:
black . --check
Expand Down Expand Up @@ -382,7 +386,14 @@ check-version:

## tidy: run black
.PHONY: tidy
tidy:
tidy: tidy-python

.PHONY: tidy-shell
tidy-shell:
shfmt -l -w .

.PHONY: tidy-python
tidy-python:
ruff . --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --fix-only --ignore COM812,PT011,PT012,SIM117 || true
autoflake --in-place .
black .
Expand Down
12 changes: 6 additions & 6 deletions examples/ingest/airtable/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
# base1/view1 → has to mention table to be valid

PYTHONPATH=. ./unstructured/ingest/main.py \
airtable \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
--output-dir airtable-ingest-output \
--num-processes 2 \
--reprocess
airtable \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
--output-dir airtable-ingest-output \
--num-processes 2 \
--reprocess
12 changes: 6 additions & 6 deletions examples/ingest/azure/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

# Structured outputs are stored in azure-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
azure \
--remote-url abfs://container1/ \
--account-name azureunstructured1 \
--output-dir azure-ingest-output \
--num-processes 2
azure \
--remote-url abfs://container1/ \
--account-name azureunstructured1 \
--output-dir azure-ingest-output \
--num-processes 2
24 changes: 12 additions & 12 deletions examples/ingest/azure_cognitive_search/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@

# Structured outputs are stored in azure-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
s3 \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--anonymous \
--output-dir s3-small-batch-output-to-azure \
--num-processes 2 \
--verbose \
--strategy fast \
azure-cognitive-search \
--key "$AZURE_SEARCH_API_KEY" \
--endpoint "$AZURE_SEARCH_ENDPOINT" \
--index utic-test-ingest-fixtures-output
s3 \
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--anonymous \
--output-dir s3-small-batch-output-to-azure \
--num-processes 2 \
--verbose \
--strategy fast \
azure-cognitive-search \
--key "$AZURE_SEARCH_API_KEY" \
--endpoint "$AZURE_SEARCH_ENDPOINT" \
--index utic-test-ingest-fixtures-output
17 changes: 8 additions & 9 deletions examples/ingest/biomed/ingest-with-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,17 @@
# For example, to download documents from 2019-01-02 00:00:00 to 2019-01-02+00:03:10"
# the parameters "from" and "until" are needed

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
biomed \
--api-from "2019-01-02" \
--api-until "2019-01-02+00:03:10" \
--output-dir biomed-ingest-output-api \
--num-processes 2 \
--verbose \
--preserve-downloads

biomed \
--api-from "2019-01-02" \
--api-until "2019-01-02+00:03:10" \
--output-dir biomed-ingest-output-api \
--num-processes 2 \
--verbose \
--preserve-downloads

# Alternatively, you can call it using:
# unstructured-ingest --biomed-api ...
15 changes: 7 additions & 8 deletions examples/ingest/biomed/ingest-with-path.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
# For example, to download the documents in the path: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/07/
# The path needed is oa_pdf/07/


SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

# The example below will ingest the PDF from the "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" path.
Expand All @@ -24,12 +23,12 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
# WARNING: There are many documents in that path.

PYTHONPATH=. ./unstructured/ingest/main.py \
biomed \
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
--output-dir biomed-ingest-output-path \
--num-processes 2 \
--verbose \
--preserve-downloads
biomed \
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
--output-dir biomed-ingest-output-path \
--num-processes 2 \
--verbose \
--preserve-downloads

# Alternatively, you can call it using:
# unstructured-ingest --biomed-path ...
17 changes: 8 additions & 9 deletions examples/ingest/box/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,14 @@
# and set up the app config.json file here:
# https://developer.box.com/guides/authentication/jwt/with-sdk/


SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
box \
--box-app-config "$BOX_APP_CONFIG_PATH" \
--remote-url box://utic-test-ingest-fixtures \
--output-dir box-output \
--num-processes 2 \
--recursive \
--verbose
box \
--box-app-config "$BOX_APP_CONFIG_PATH" \
--remote-url box://utic-test-ingest-fixtures \
--output-dir box-output \
--num-processes 2 \
--recursive \
--verbose
14 changes: 7 additions & 7 deletions examples/ingest/confluence/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ cd "$SCRIPT_DIR"/../../.. || exit 1
# --max-num-of-docs-from-each-space 250 \
# --> The maximum number of documents to be ingested from each space. Set as 250 in the example.
PYTHONPATH=. ./unstructured/ingest/main.py \
confluence \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--url https://unstructured-ingest-test.atlassian.net \
--user-email [email protected] \
--api-token ABCDE1234ABDE1234ABCDE1234 \
--output-dir confluence-ingest-output \
--num-processes 2
confluence \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--url https://unstructured-ingest-test.atlassian.net \
--user-email [email protected] \
--api-token ABCDE1234ABDE1234ABCDE1234 \
--output-dir confluence-ingest-output \
--num-processes 2
24 changes: 12 additions & 12 deletions examples/ingest/delta_table/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@

# AWS credentials need to be available for use with the storage options
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
echo "aws credentials not found as env vars"
exit 0
echo "aws credentials not found as env vars"
exit 0
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
delta-table \
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
--output-dir delta-table-output \
--num-processes 2 \
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
--verbose \
delta-table \
--write-column json_data \
--table-uri delta-table-dest
delta-table \
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
--output-dir delta-table-output \
--num-processes 2 \
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
--verbose \
delta-table \
--write-column json_data \
--table-uri delta-table-dest
16 changes: 8 additions & 8 deletions examples/ingest/discord/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@

# Structured outputs are stored in discord-example/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
discord \
--channels 12345678 \
--token "$DISCORD_TOKEN" \
--download-dir discord-ingest-download \
--output-dir discord-example \
--preserve-downloads \
--verbose
discord \
--channels 12345678 \
--token "$DISCORD_TOKEN" \
--download-dir discord-ingest-download \
--output-dir discord-example \
--preserve-downloads \
--verbose
18 changes: 8 additions & 10 deletions examples/ingest/dropbox/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,14 @@

# Structured outputs are stored in dropbox-output/


SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1


PYTHONPATH=. ./unstructured/ingest/main.py \
dropbox \
--remote-url "dropbox:// /" \
--output-dir dropbox-output \
--token "$DROPBOX_TOKEN" \
--num-processes 2 \
--recursive \
--verbose
dropbox \
--remote-url "dropbox:// /" \
--output-dir dropbox-output \
--token "$DROPBOX_TOKEN" \
--num-processes 2 \
--recursive \
--verbose
14 changes: 7 additions & 7 deletions examples/ingest/elasticsearch/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ wait
trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT

PYTHONPATH=. ./unstructured/ingest/main.py \
elasticsearch \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--url http://localhost:9200 \
--index-name movies \
--jq-query '{ethnicity, director, plot}' \
--output-dir elasticsearch-ingest-output \
--num-processes 2
elasticsearch \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--url http://localhost:9200 \
--index-name movies \
--jq-query '{ethnicity, director, plot}' \
--output-dir elasticsearch-ingest-output \
--num-processes 2
14 changes: 7 additions & 7 deletions examples/ingest/github/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@

# Structured outputs are stored in github-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
github \
--url Unstructured-IO/unstructured \
--git-branch main \
--output-dir github-ingest-output \
--num-processes 2 \
--verbose
github \
--url Unstructured-IO/unstructured \
--git-branch main \
--output-dir github-ingest-output \
--num-processes 2 \
--verbose

# Alternatively, you can call it using:
# unstructured-ingest github --url ...
14 changes: 7 additions & 7 deletions examples/ingest/gitlab/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@

# Structured outputs are stored in gitlab-ingest-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
gitlab \
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
--git-branch 'v0.0.7' \
--output-dir gitlab-ingest-output \
--num-processes 2 \
--verbose
gitlab \
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
--git-branch 'v0.0.7' \
--output-dir gitlab-ingest-output \
--num-processes 2 \
--verbose

# Alternatively, you can call it using:
# unstructured-ingest gitlab --gitlab-url ...
14 changes: 7 additions & 7 deletions examples/ingest/google_cloud_storage/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@

# Structured outputs are stored in gcs-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
gcs \
--remote-url gs://utic-test-ingest-fixtures-public/ \
--output-dir gcs-output \
--num-processes 2 \
--recursive \
--verbose
gcs \
--remote-url gs://utic-test-ingest-fixtures-public/ \
--output-dir gcs-output \
--num-processes 2 \
--recursive \
--verbose
Loading

0 comments on commit 76efcf4

Please sign in to comment.