debug line test

RabbitBear00 · Jan 8, 2021 · f0e3db0 · f0e3db0
1 parent a48cb72
commit f0e3db0
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 31,885 deletions.
diff --git a/README.md b/README.md
@@ -70,12 +70,20 @@ Score range: 0 - 10
 ## Data structure
 The following tree shows the file structure of this corpus:
 ```
-├── scripts
+├── scores.json
+├── scores-detail.json
+├── dev
+│   ├── spk2age
+│   ├── spk2gender
+│   ├── spk2utt
+│   ├── text
+│   ├── utt2spk
+│   └── wav.scp
+├── test
 │   ├── spk2age
 │   ├── spk2gender
 │   ├── spk2utt
 │   ├── text
-│   ├── utt2score
 │   ├── utt2spk
 │   └── wav.scp
 └── WAVE
@@ -95,82 +103,113 @@ The following tree shows the file structure of this corpus:
         ├── ...
 ```
 
-Most files in `scripts` use Kaldi's data style.
-The most important file is `utt2score`, whose format is similar with `text`, while the second column of each line is a JSON:
+There are two datasets: `dev` and `test`, and both are in Kaldi's data directory style.
 
-```
-000010011	{"text": "we call it bear", "total": [7.6, 9.0, 7.9, 6.4, 9.1], "accuracy": [7,  ... }
-000010053	{"text": "three two two seven", "total": [7.6, 10.0, 8.9, 9.0, 9.2], "accuracy": ... }
-000010063	{"text": "elephants tai goose", "total": [10.0, 9.8, 8.9, 8.2, 9.9], "accuracy": ... }
+The scores are stored in `scores.json`. Here is an example:
+
+```json
+{
+    "000010011": {                                     # utt-id
+        "text": "WE CALL IT BEAR",                     # transcript text
+        "accuracy": 8,                                 # sentence-level accuracy score
+        "completeness": 10.0,                          # sentence-level completeness score
+        "fluency": 9,                                  # sentence-level fluency score
+        "prosodic": 9,                                 # sentence-level prosodic score
+        "total": 8,                                    # sentence-level total score
+        "words": [
+            {
+                "accuracy": 10,                        # word-level accuracy score
+                "stress": 10,                          # word-level stress score
+                "total": 10,                           # word-level total score
+                "text": "WE",                          # the word text
+                "phones": "W IY0",                     # phones of the word                        
+                "phones-accuracy": [2.0, 2.0]          # phoneme-level accuracy score
+            },
+            {
+                "accuracy": 10,
+                "stress": 10,
+                "total": 10,
+                "text": "CALL",
+                "phones": "K AO0 L",
+                "phones-accuracy": [2.0, 1.8, 1.8]
+            },
+            {
+                "accuracy": 10,
+                "stress": 10,
+                "total": 10,
+                "text": "IT",
+                "phones": "IH0 T",
+                "phones-accuracy": [2.0, 2.0]
+            },
+            {
+                "accuracy": 6,
+                "stress": 10,
+                "total": 6,
+                "text": "BEAR",
+                "phones": "B EH0 R",
+                "phones-accuracy": [2.0, 1.0, 1.0]
+            }
+        ]
+    },
+    ...
+}
 ```
 
+The file `scores.json` is generated from `scores-detail.json`.
+The two JSON files are almost the same, but `scores-detail.json` has the original scores of the five experts,
+while the scores in `scores.json` are the average or median scores.
 
-### An example of JSON line in `utt2score`
+An example item in `scores-detail.json`:
 ```json
 {
-    "text": "we call it bear",
-    "total": [7.6, 9, 7.9, 6.4, 9.1],
-    "accuracy": [7, 9, 8, 8, 9],
-    "completeness": [1, 1, 1, 0.75, 1],
-    "fluency": [10, 9, 8, 8, 10],
-    "prosodic": [10, 9, 7, 8, 9],
-    "words": [
-        {
-            "text": "we",
-            "total": [10, 10, 10, 10, 10],
-            "accuracy": [10, 10, 10, 10, 10],
-            "stress": [10, 10, 10, 10, 10],
-            "phones": ["W IY",
-                       "W IY",
-                       "W IY",
-                       "W IY",
-                       "W IY"
-           ]
-        },
-        {
-            "text": "call",
-            "total": [10, 8.4, 10, 10, 8.4],
-            "accuracy": [10, 8, 10, 10, 8],
-            "stress": [10, 10, 10, 10, 10],
-            "phones": [
-                "K AO L",
-                "K {AO} L",
-                "K AO L",
-                "K AO L",
-                "K AO {L}"
-           ]
-        },
-        {
-            "text": "it",
-            "total": [10, 10, 10, 10, 10],
-            "accuracy": [10, 10, 10, 10, 10],
-            "stress": [10, 10, 10, 10, 10],
-            "phones": [
-                "IH T",
-                "IH T",
-                "IH T",
-                "IH T",
-                "IH T"
-           ]
-        },
-        {
-            "text": "bear",
-            "total": [4.4, 7.6, 10, 3.6, 6.8],
-            "accuracy": [3, 7, 10, 2, 6],
-            "stress": [10, 10, 10, 10, 10],
-            "phones": [
-                "B (EH) R",
-                "B EH {R}",
-                "B AR",
-                "B (AR)",
-                "B EH [L] R"
-           ]
-        }
-   ]
+    "000010011": {
+
+        "text": "WE CALL IT BEAR",
+        "accuracy": [7.0, 9.0, 8.0, 8.0, 9.0],
+        "completeness": [1.0, 1.0, 1.0, 1.0, 1.0],
+        "fluency": [10.0, 9.0, 8.0, 8.0, 10.0],
+        "prosodic": [10.0, 9.0, 7.0, 8.0, 9.0],
+        "total": [7.6, 9.0, 7.9, 8.0, 9.1],
+        "words": [
+            {
+                "accuracy": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "stress": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "total": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "text": "WE",
+                "ref-phones": "W IY0",
+                "phones": ["W IY0", "W IY0", "W IY0", "W IY0", "W IY0"]
+            },
+            {
+                "accuracy": [10.0, 8.0, 10.0, 10.0, 8.0],
+                "stress": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "total": [10.0, 8.4, 10.0, 10.0, 8.4],
+                "text": "CALL",
+                "ref-phones": "K AO0 L",
+                "phones": ["K AO0 L", "K {AO0} L", "K AO0 L", "K AO0 L", "K AO0 {L}"]
+            },
+            {
+                "accuracy": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "stress": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "total": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "text": "IT",
+                "ref-phones": "IH0 T",
+                "phones": ["IH0 T", "IH0 T", "IH0 T", "IH0 T", "IH0 T"]
+            },
+            {
+                "accuracy": [3.0, 7.0, 10.0, 2.0, 6.0],
+                "stress": [10.0, 10.0, 10.0, 10.0, 10.0],
+                "phones": ["B (EH0) (R)", "B {EH0} {R}", "B EH0 R", "B (EH0) (R)", "B EH0 [L] R"],
+                "total": [4.4, 7.6, 10.0, 3.6, 6.8],
+                "text": "BEAR",
+                "ref-phones": "B EH0 R"
+            }
+        ]
+    },
+    ...
 }
 ```
 
-The phoneme-level scores are notated in the following convenient notation:
+In `scores-detail.json`, the phoneme-level scores are written using the following compact notation:
 
 * for score 2, do not use any symbol
 * for score 1, use "{}" symbol