Data processing tools for data analysis
-
Install Miniforge
# Download the Miniforge installer whose name matches this machine's
# `uname` (OS) and `uname -m` (architecture) output, then run it.
wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash "Miniforge3-$(uname)-$(uname -m).sh"
-
Clone the repository
# Remove any previous checkout, clone the repository over SSH, and enter it.
rm -rf chrisdata*
git clone git@github.com:chrisjihee/chrisdata.git
cd chrisdata*
-
Create a new environment
# Create a Python 3.11 environment named "chrisdata" (-y skips the
# confirmation prompt), then activate it.
mamba create -n chrisdata python=3.11 -y
mamba activate chrisdata
-
Install the required packages
# Install this project in editable mode.
pip install -U -e .
# Replace any previous chrisbase checkout with a fresh clone and install it
# in editable mode as well.
rm -rf chrisbase*
git clone git@github.com:chrisjihee/chrisbase.git
pip install -U -e chrisbase*
# List the installed packages whose names match the patterns below.
pip list | grep -E "mongo|search|Wiki|wiki|json|pydantic|chris|Flask"
-
Install MongoDB
# Create ./mongodb with data/ and log/ subdirectories. The && chain stops
# early so nothing is created or downloaded in the wrong directory if a
# step fails.
mkdir -p mongodb && cd mongodb && mkdir -p data log
# Download the MongoDB 8.0.0 tarball for the current OS.
if [ "$(uname)" = "Linux" ]; then
  aria2c https://fastdl.mongodb.org/linux/mongodb-linux-x86_64-ubuntu2204-8.0.0.tgz
elif [ "$(uname)" = "Darwin" ]; then
  aria2c https://fastdl.mongodb.org/osx/mongodb-macos-arm64-8.0.0.tgz
fi
# Unpack into the current directory, dropping the archive's top-level folder.
tar zxvf mongodb-*.tgz --strip-components=1
cd ..
-
Run MongoDB
# Start mongod with the cfg/mongod-8800.yaml configuration.
cd mongodb
bin/mongod --config ../cfg/mongod-8800.yaml
cd ..
# Start mongod with the cfg/mongod-8801.yaml configuration.
cd mongodb
bin/mongod --config ../cfg/mongod-8801.yaml
cd ..
-
Install Elasticsearch
mkdir elasticsearch7; cd elasticsearch7
# Download the Elasticsearch 7.17.10 tarball for the current OS.
if [ "$(uname)" = "Linux" ]; then
  aria2c https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.10-linux-x86_64.tar.gz
elif [ "$(uname)" = "Darwin" ]; then
  aria2c https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.17.10-darwin-aarch64.tar.gz
fi
# Unpack into the current directory, dropping the archive's top-level folder.
tar zxf elasticsearch-*.tar.gz --strip-components 1
# Uncomment the HTTP port setting and change it from 9200 to 9717.
# `-i.bak` (suffix attached) is accepted by both GNU sed (Linux) and BSD sed
# (macOS); the BSD-only form `-i ''` fails on Linux. Remove the backup after.
sed -i.bak 's/#http.port: 9200/http.port: 9717/g' ./config/elasticsearch.yml \
  && rm -f ./config/elasticsearch.yml.bak
# Append the security setting to the configuration file.
echo "xpack.security.enabled: true" >> ./config/elasticsearch.yml
cd ..
-
Link input data
# Symlink the wikidata and wikipedia data directories into ./input.
cd input
ln -s /mnt/geo/data/wikidata .
ln -s /mnt/geo/data/wikipedia .
cd ..
-
Show help
# Request help from the top-level CLI, then from its `wikipedia` and
# `wikidata` command groups.
python -m chrisdata.cli --help
python -m chrisdata.cli wikipedia --help
python -m chrisdata.cli wikidata --help
-
Run commands
-
To convert Wikipedia articles
# Invoke `convert` under the CLI's `wikipedia` command group.
# NOTE(review): options are not shown here; check `python -m chrisdata.cli wikipedia --help`.
python -m chrisdata.cli wikipedia convert
-
To parse Wikidata dump
# Invoke `parse` under the CLI's `wikidata` command group.
# NOTE(review): options are not shown here; check `python -m chrisdata.cli wikidata --help`.
python -m chrisdata.cli wikidata parse
-
To filter Wikidata entities
# Invoke `filter` under the CLI's `wikidata` command group.
# NOTE(review): options are not shown here; check `python -m chrisdata.cli wikidata --help`.
python -m chrisdata.cli wikidata filter
-
To convert Wikidata entities
# Invoke `convert` under the CLI's `wikidata` command group.
# NOTE(review): options are not shown here; check `python -m chrisdata.cli wikidata --help`.
python -m chrisdata.cli wikidata convert