
Upload preprocessed kg and update README
qibinc committed Oct 7, 2019
1 parent f230357 commit 3ced3f8
Showing 5 changed files with 121 additions and 11 deletions.
59 changes: 58 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -6,4 +6,61 @@ Towards **K**nowledge-**B**ased **R**ecommender **D**ialog System.<br>
[Qibin Chen](https://www.qibin.ink), [Junyang Lin](https://justinlin610.github.io), Yichang Zhang, Ming Ding, [Yukuo Cen](https://sites.google.com/view/yukuocen), [Hongxia Yang](https://sites.google.com/site/hystatistics/home), [Jie Tang](http://keg.cs.tsinghua.edu.cn/jietang/).<br>
In EMNLP-IJCNLP 2019

**Under construction.**
## Prerequisites

- Linux
- Python 3.6
- PyTorch 1.2.0

## Getting Started

### Installation

Clone this repo.

```bash
git clone https://github.com/THUDM/KBRD
cd KBRD
```

Please install dependencies by

```bash
pip install -r requirements.txt
```

### Dataset

- We use the **TaoDescribe** dataset, which will be automatically downloaded by the script.
- Download the refined knowledge base (dbpedia) used in this paper [here](https://cloud.tsinghua.edu.cn/f/6af126bdccc44352bfee/?dl=1). Decompress it to obtain the `dbpedia/` folder, which should contain the two files `mappingbased_objects_en.ttl` and `short_abstracts_en.ttl`.
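If you want to inspect the two `.ttl` dumps, here is a minimal parser sketch. It assumes each line follows the N-Triples convention (`<subject> <predicate> object .`, where the object is either a `<URI>` or a quoted literal) — check a few lines of the actual files before relying on this:

```python
import re

# One N-Triples line: "<subject> <predicate> object ." — the object may be a
# <URI> (mappingbased_objects) or a quoted literal (short_abstracts).
TRIPLE_RE = re.compile(r"<([^>]+)>\s+<([^>]+)>\s+(.+?)\s*\.\s*$")


def parse_ntriple(line):
    """Parse one line into (subject, predicate, object); return None for non-triple lines."""
    m = TRIPLE_RE.match(line)
    if m is None:
        return None
    subj, pred, obj = m.groups()
    # Strip angle brackets from URI objects; leave quoted literals as-is.
    if obj.startswith("<") and obj.endswith(">"):
        obj = obj[1:-1]
    return subj, pred, obj
```

This is a sketch for exploration only; the repository's own preprocessing is what actually consumes these files.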

### Training

```bash
bash scripts/both.sh <num_exps> <gpu_id>
```

### Logging

TensorBoard logs and models will be saved in the `saved/` folder.

### Evaluation

- `show_bias.py` shows the vocabulary bias of a specific movie (as in Table 4).

TODO

If you have difficulty getting the above steps to work, please let us know.

## Cite

Please cite our paper if you use this code in your own work:

```
@article{chen2019towards,
title={Towards Knowledge-Based Recommender Dialog System},
author={Chen, Qibin and Lin, Junyang and Zhang, Yichang and Ding, Ming and Cen, Yukuo and Yang, Hongxia and Tang, Jie},
journal={arXiv preprint arXiv:1908.05391},
year={2019}
}
```
5 changes: 3 additions & 2 deletions scripts/both.sh
@@ -1,8 +1,9 @@
#!/bin/bash
-let num_runs=32
+let num_runs=$1
+let gpu_id=$2

for i in $(seq 0 $((num_runs-1)));
do
-CUDA_VISIBLE_DEVICES=2 python parlai/tasks/redial/train_kbrd.py -mf saved/both_rgcn_$i
+CUDA_VISIBLE_DEVICES=$gpu_id python parlai/tasks/redial/train_kbrd.py -mf saved/both_rgcn_$i
done

37 changes: 37 additions & 0 deletions scripts/diversity_scorer.py
@@ -0,0 +1,37 @@
import argparse


def generate_n_grams(x, n):
    """Return the set of n-grams (as tuples) in the token list x."""
    return set(zip(*[x[i:] for i in range(n)]))


def distinct_n_grams(tokenized_lines, n):
    """Count unique n-grams across all lines, and the ratio of that count to the number of lines."""
    n_grams_all = set()
    for line in tokenized_lines:
        n_grams_all |= generate_n_grams(line, n)
    return len(n_grams_all), len(n_grams_all) / len(tokenized_lines)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('input', type=str)
    args = parser.parse_args()
    with open(args.input) as f:
        lines = f.read().strip().split('\n')
    # Drop the first token of each line (assumed to be a prefix such as a line id).
    tokenized = [line.split()[1:] for line in lines]

    for n in range(1, 6):
        cnt, percent = distinct_n_grams(tokenized, n)
        print(f'Distinct {n}-grams (cnt, percentage) = ({cnt}, {percent:.3f})')
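A quick sanity check of the scorer's behavior, with the two functions re-declared so the snippet runs on its own:

```python
def generate_n_grams(x, n):
    # All n-grams of token list x, as a set of tuples.
    return set(zip(*[x[i:] for i in range(n)]))


def distinct_n_grams(tokenized_lines, n):
    # Unique n-grams across all lines, plus the ratio of that count to the number of lines.
    n_grams_all = set()
    for line in tokenized_lines:
        n_grams_all |= generate_n_grams(line, n)
    return len(n_grams_all), len(n_grams_all) / len(tokenized_lines)


lines = [["i", "like", "sci", "fi"], ["i", "like", "comedies"]]
cnt, ratio = distinct_n_grams(lines, 2)
# Bigrams: (i, like), (like, sci), (sci, fi), (like, comedies) -> 4 unique over 2 lines.
print(cnt, ratio)  # 4 2.0
```

Note that the ratio is normalized by the number of lines, not by the total number of n-gram tokens, so it can exceed 1.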
23 changes: 23 additions & 0 deletions scripts/export_triples.py
@@ -0,0 +1,23 @@
import pickle as pkl

if __name__ == "__main__":
    # Load the subgraph and the entity/relation id mappings produced by preprocessing.
    subkg = pkl.load(open('data/redial/subkg.pkl', 'rb'))
    entity2entityId = pkl.load(open('data/redial/entity2entityId.pkl', 'rb'))
    entityId2entity = {v: k for k, v in entity2entityId.items()}
    relation2relationId = pkl.load(open('data/redial/relation2relationId.pkl', 'rb'))
    relationId2relation = {v: k for k, v in relation2relationId.items()}

    triples = []
    for headId in subkg:
        for relationId, tailId in subkg[headId]:
            relation = relationId2relation[relationId]
            # Self-loop edges carry no information for the exported triple list.
            if relation == 'self_loop':
                continue
            triples.append(f"{entityId2entity[headId]} {relation} {entityId2entity[tailId]}\n")

    print(len(triples))
    with open('triples.txt', 'w') as f:
        f.writelines(triples)
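The id-to-name inversion and self-loop filtering above can be exercised on a toy graph. The values below are hypothetical, but the shapes mirror the real pickles (`subkg` maps a head id to a list of `(relationId, tailId)` pairs):

```python
# Toy stand-ins for the pickled structures (hypothetical values, real shapes).
subkg = {0: [(0, 0), (1, 1)], 1: [(0, 1)]}          # headId -> [(relationId, tailId)]
entityId2entity = {0: "Inception", 1: "Christopher_Nolan"}
relationId2relation = {0: "self_loop", 1: "director"}

triples = []
for head_id, edges in subkg.items():
    for rel_id, tail_id in edges:
        relation = relationId2relation[rel_id]
        if relation == "self_loop":                  # skip self-loop edges
            continue
        triples.append(f"{entityId2entity[head_id]} {relation} {entityId2entity[tail_id]}")

print(triples)  # ['Inception director Christopher_Nolan']
```

Both self-loop edges are dropped, leaving the single informative `director` triple.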

8 changes: 0 additions & 8 deletions scripts/onlymovie.sh

This file was deleted.
