From 2cc781de8d3ad451a0095185312c17f704541def Mon Sep 17 00:00:00 2001
From: Yang Heng
Date: Sun, 16 Jan 2022 14:42:11 +0000
Subject: [PATCH] update_readme

---
 .gitignore | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 README.MD  |  43 +++++++++++----
 2 files changed, 188 insertions(+), 10 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..28d67be9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,155 @@
+# dev files
+*.cache
+*.dev.py
+state_dict/
+*.results
+*.tokenizer
+*.model
+*.state_dict
+*.config
+*.args
+*.zip
+*.gz
+*.bin
+*.result.txt
+*.DS_Store
+*.tmp
+*.args.txt
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+*.pyc
+*.result.json
+.idea/
+
+# Embedding
+glove.840B.300d.txt
+glove.42B.300d.txt
+glove.twitter.27B.txt
+
+# project main files
+release_note.json
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer training_logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+.DS_Store
+.DS_Store
+examples/.DS_Store

diff --git a/README.MD b/README.MD
index 4fe12080..1c64f0f1 100644
--- a/README.MD
+++ b/README.MD
@@ -6,28 +6,51 @@
 [![total clones per week](https://raw.githubusercontent.com/yangheng95/ABSADatasets/traffic/total_clones_per_week.svg)](https://github.com/yangheng95/ABSADatasets/tree/traffic#-total-traffic-data-badge)

 ## Contribute (prepare) your dataset for use in PyABSA
-We hope you can share your custom dataset or a available public dataset.
+We hope you can share your custom dataset or an available public dataset.
 If you want to, follow these steps:

 ### Important: Rename your dataset files before using them in PyABSA
-- APC dataset name should be: {id}.dataset name}.{type}.dat , e.g., `996.restaurant.train.dat`, `996.restaurant.test.dat`, `996.restaurant.valid.dat`
-- ATEPC dataset name should be: {id}.{dataset name}.{type}.dat.atepc , `e.g., 996.restaurant.train.dat.atepc`, `996.restaurant.test.dat.atepc`, `996.restaurant.valid.dat.atepc`
+Although the integrated datasets have no ids, it is recommended to assign an id to your dataset.
+When merging your dataset into ABSADatasets, please keep its id unchanged.
+
+- The APC dataset name should be {id}.{dataset name}, and the dataset files should be named {id}.{dataset name}.{type}.dat, e.g.,
+```tree
+datasets
+├── 996.restaurant
+│   ├── 996.restaurant.train.dat    # train_dataset
+│   ├── 996.restaurant.test.dat     # test_dataset
+│   └── 996.restaurant.valid.dat    # valid_dataset; dev sets are not recognized by PyABSA, please rename your dev set to a valid set
+└── others
+```
+
+- The ATEPC dataset files should be named {id}.{dataset name}.{type}.dat.atepc, e.g.,
+```tree
+datasets
+├── 996.restaurant
+│   ├── 996.restaurant.train.dat.atepc    # train_dataset
+│   ├── 996.restaurant.test.dat.atepc     # test_dataset
+│   └── 996.restaurant.valid.dat.atepc    # valid_dataset; dev sets are not recognized by PyABSA, please rename your dev set to a valid set
+└── others
+```

-Then, use the id to loacte your dataset, e.g., 
+Then, use the id to locate your dataset, e.g.,

 ```python3
 from pyabsa.functional import APCConfigManager
+from pyabsa.functional import Trainer
+from autocuda import auto_cuda

 config = APCConfigManager.get_apc_config_english()

 # APC task
- Trainer(config=config,
-         dataset='996.restaurant',  # train set and test set will be automatically detected
-         checkpoint_save_mode=1,
-         auto_device=device  # automatic choose CUDA or CPU
-         )
+Trainer(config=config,
+        dataset='996.restaurant',  # train set and test set will be automatically detected
+        checkpoint_save_mode=1,
+        auto_device=auto_cuda()  # automatically choose CUDA or CPU
+        )
 ```

-It will avoid some potential probelm while PyABSA detects the dataset.
+It will avoid potential problems (e.g., duplicated dataset names) while PyABSA detects the dataset.

 ### Dataset Processing
 - Format your APC dataset according to our dataset format. (**Recommended. Once you have finished this step, we can help you finish the other steps**)
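Before opening a pull request, contributors may want to sanity-check their file layout against the naming scheme described in the README changes above. The following is a minimal sketch of such a check, not an official PyABSA utility: the `check_dataset_folder` helper and the `datasets/996.restaurant` path are hypothetical, and the regular expression simply encodes the `{id}.{dataset name}.{type}.dat(.atepc)` convention from this patch.

```python3
# A minimal sketch for checking dataset file names against the
# {id}.{dataset name}.{type}.dat(.atepc) convention described above.
# The folder path used below is a hypothetical example.
import re
from pathlib import Path

# id: digits; dataset name: word characters or hyphens;
# type: train/test/valid (dev sets are not recognized by PyABSA)
PATTERN = re.compile(r"^\d+\.[\w\-]+\.(train|test|valid)\.dat(\.atepc)?$")

def check_dataset_folder(folder: str) -> None:
    """Print a naming verdict for every file in the dataset folder."""
    for path in sorted(Path(folder).iterdir()):
        if PATTERN.match(path.name):
            print(f"ok         {path.name}")
        elif ".dev." in path.name:
            print(f"rename     {path.name}  (use 'valid' instead of 'dev')")
        else:
            print(f"bad name   {path.name}")

check_dataset_folder("datasets/996.restaurant")
```

Under this sketch, a file such as `996.restaurant.valid.dat.atepc` passes, while `996.restaurant.dev.dat` is flagged for renaming, matching the note above that PyABSA recognizes valid sets but not dev sets.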