-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate-dataset.js
157 lines (135 loc) · 7.71 KB
/
create-dataset.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/**
* Run multi-speaker.js first, then run this script to append more samples to the wavs folder and metadata.csv
*/
'use strict'
import fs from 'fs'
import path from 'path'
import async from 'async'
import {exec} from 'child_process'
import jsb from 'json-beautify'
import { normalizePrompt, normalizeText } from './common_functions.js'
// Folder holding the tab-separated label files (<name>.txt), one per audio file.
// The same label files are also in '/Volumes/1TB/audio/final uploaded'.
const labelInputFolder = '/Users/janaka/node/tipitaka.lk/dev/audio'
// Folder holding the text files (<name>.json) whose entries are matched against the labels.
const textInputFolder = '/Users/janaka/node/tipitaka.lk/public/static/text'
// JSON map: text file name -> list of label/audio file names.
const fileMapFile = '/Users/janaka/node/tipitaka.lk/public/static/data/file-map.json'
// Folder holding the source audio (<name>.flac) that clips are cut from.
const audioInputFolder = '/Volumes/1TB/audio/silence-added'

// Only labels longer than minClipLength and at most maxClipLength (seconds) are usable.
const minClipLength = 2
const maxClipLength = 24
// TODO: ap- should not have any gatha, force set to para/default - but dhs was
// (currently only text files starting with "ap-dhs" have gatha forced to paragraph).
// Neither regex may carry the `g` flag: both are used with .test() once per file in a loop,
// and a global regex remembers lastIndex between calls, so every second consecutive
// matching file name would fail the `^` anchor and be silently skipped.
const forceTypeNoGatha = /^(ap-dhs)/, excludeFiles = /^ap-yam/
// Parsed contents of fileMapFile, read synchronously at startup.
// It is iterated below as [textFile, labelFiles] pairs, where labelFiles is a list of label file names.
const fileMap = JSON.parse(fs.readFileSync(fileMapFile, 'utf-8'))
/**
 * Read the label file `<labelInputFolder>/<file>.txt` and parse it into label records.
 * Blank lines are ignored. Every other line is trimmed and split on tabs, and each field
 * is converted with Number(); the first three fields become start, end and num.
 * @param {string} file - label file name without the `.txt` extension
 * @returns {Array<{start: number, end: number, length: number, file: string, num: number}>}
 *          one record per non-blank line, where length = end - start
 */
function loadLabelFile(file) {
    const labelPath = path.join(labelInputFolder, file + '.txt')
    const records = []
    for (const rawLine of fs.readFileSync(labelPath, 'utf-8').split('\n')) {
        const line = rawLine.trim()
        if (!line) continue // skip blank lines
        const [start, end, num] = line.split('\t').map(field => Number(field))
        records.push({start, end, length: end - start, file, num})
    }
    return records
}
/**
 * Split Sinhala text into words.
 * Every character outside the Sinhala Unicode block (U+0D80-U+0DFF), other than the plain
 * space, is deleted first (punctuation, digits, Latin letters, zero-width joiners, ...).
 * The remainder is split on spaces and empty pieces are dropped.
 * @param {string} sinhala - text in Sinhala script
 * @returns {string[]} the non-empty words, in order
 */
function splitWords(sinhala) {
    const sinhalaOnly = sinhala.replace(/[^\u0D80-\u0DFF ]/g, '')
    const words = []
    for (const piece of sinhalaOnly.split(' ')) {
        if (piece.length > 0) words.push(piece)
    }
    return words
}
/**
 * Median of the occurrence counts of the given words, looked up in the module-level `wordCounts`.
 * For an even number of words the two middle counts are averaged.
 * For an empty list the result is NaN (there are no middle elements to average).
 * @param {string[]} words
 * @returns {number} median count
 */
function getMedianWordFrequency(words) {
    const frequencies = words.map(word => wordCounts[word])
    frequencies.sort((x, y) => x - y)
    const count = frequencies.length
    const upperMiddle = Math.floor(count / 2)
    if (count % 2 === 1) return frequencies[upperMiddle]
    return (frequencies[upperMiddle - 1] + frequencies[upperMiddle]) / 2
}
/**
 * Increment the counter stored at arr[prop]; a missing (or falsy) value starts at 1.
 * @param {Object} arr - object used as a counter table
 * @param {string} prop - key to increment
 * @returns {number} the new value of arr[prop]
 */
const incCounter = (arr, prop) => {
    const next = (arr[prop] || 0) + 1
    arr[prop] = next
    return next
}
// wordCounts: word -> number of deduplicated usable entries' occurrences of that word
// dedupList: space-joined word sequence -> true once an entry with that text has been kept
const wordCounts = {}
const dedupList = {}
// Running totals over every labeled entry (not only the usable ones), used in the final report
let totalLabels = 0
let totalLength = 0
// Usable entries accumulated across all text files
const usableEntries = []

for (const [textFile, labelFiles] of Object.entries(fileMap)) {
    if (excludeFiles.test(textFile)) continue

    // all labels of this text file, in the order of its label files
    const labels = []
    for (const labelFile of labelFiles) {
        for (const label of loadLabelFile(labelFile)) labels.push(label)
    }

    // all pali entries of this text file, page by page
    const textPath = path.join(textInputFolder, textFile + '.json')
    const {pages} = JSON.parse(fs.readFileSync(textPath, 'utf-8'))
    const entries = []
    for (const page of pages) {
        for (const entry of page.pali.entries) entries.push(entry)
    }

    // join labels with entries: the i-th entry that has audio gets the i-th label.
    // Entries left without a label are dropped (some text files could be only partially recorded).
    const labeled = []
    let labelIndex = 0
    for (const entry of entries) {
        if (entry.noAudio) continue
        entry.label = labels[labelIndex++]
        entry.textFile = textFile
        if (entry.label) labeled.push(entry)
    }

    if (forceTypeNoGatha.test(textFile)) {
        for (const entry of labeled) entry.type = (entry.type == 'gatha') ? 'paragraph' : entry.type
    }

    // keep only the labels whose length is within the desired range
    const inRange = labeled.filter(({label}) => label.length > minClipLength && label.length <= maxClipLength)
    for (const entry of inRange) {
        const text = normalizeText(entry.text, entry.type)
        Object.assign(entry, normalizePrompt(text, entry.type))
        entry.words = splitWords(entry.sinhala)
    }

    // drop any text that occurred before (in this or an earlier file); count words of kept entries only
    const usable = []
    for (const entry of inRange) {
        const key = entry.words.join(' ')
        if (dedupList[key]) continue
        for (const word of entry.words) incCounter(wordCounts, word) // give higher prob to rare words
        dedupList[key] = true
        usable.push(entry)
    }

    let fileLength = 0
    for (const entry of labeled) fileLength += entry.label.length
    totalLength += fileLength
    totalLabels += labeled.length
    console.log(`file: ${textFile}, all labels: ${labeled.length}, usable labels: ${usable.length}`)
    for (const entry of usable) usableEntries.push(entry)
}
// Score every usable entry, drop length-ratio outliers, then order by score.
// score: median frequency of the entry's words (lower = rarer words)
// lengthRatio: label length divided by the number of roman characters, not counting 'h'
for (const entry of usableEntries) {
    entry.score = getMedianWordFrequency(entry.words)
    const romanWithoutH = entry.roman.replace(/h/g, '')
    entry.lengthRatio = entry.label.length / romanWithoutH.length
}
// number of entries cut from each end of the lengthRatio ordering
const outliersToRemove = 200
usableEntries.sort((left, right) => left.lengthRatio - right.lengthRatio) // sorts usableEntries in place
const outlierRemoved = usableEntries.slice(outliersToRemove, -outliersToRemove)
outlierRemoved.sort((left, right) => left.score - right.score) // ascending order of the score
const outlierLength = outlierRemoved
    .map(entry => entry.label.length)
    .reduce((sum, length) => sum + length, 0)
const requiredLength = 30 * 3600, // collect until this many seconds (30 hours) of audio are reached
    wavFileOffset = 1000, // allow 1000 from the multi-speaker.js
    // maximum share of the collected entries each type may take,
    // otherwise too many gatha/headings will be selected since they are the short entries
    maxAllowed = { gatha: 0.2, centered: 0.05, heading: 0.1 }
// running totals of the selection: collected seconds, and entry counts per type plus `total`
let collectedLength = 0, collectedCount = { total: 0, }
/**
 * Decide whether one more entry of the given type may be collected.
 * Types without a cap in maxAllowed are always collectable. Previously they were compared
 * against a default cap of 1 with a strict `>`, which rejected them whenever everything
 * collected so far was of that same type (share exactly 1).
 * Capped types are collectable while their current share of the collected entries is below the cap.
 * @param {string} type - entry type, e.g. 'gatha', 'heading', 'paragraph'
 * @returns {boolean}
 */
const canCollect = (type) => {
    const cap = maxAllowed[type]
    if (!cap) return true // no cap configured for this type
    const share = (collectedCount[type] || 0) / (collectedCount.total || 1)
    return cap > share
}
// Walk the entries in score order and take each one whose type is still allowed,
// until the collected audio length has exceeded requiredLength.
const usedEntries = []
for (const entry of outlierRemoved) {
    if (!canCollect(entry.type) || collectedLength > requiredLength) continue
    collectedLength += entry.label.length
    collectedCount[entry.type] = (collectedCount[entry.type] || 0) + 1
    collectedCount.total++
    usedEntries.push(entry)
}
// Name the wav files in ascending lengthRatio order, numbering from wavFileOffset
// (zero-padded to at least 4 digits).
usedEntries.sort((left, right) => left.lengthRatio - right.lengthRatio)
usedEntries.forEach((entry, position) => {
    const sequence = (position + wavFileOffset).toString()
    entry.wavFile = 'pali_' + sequence.padStart(4, '0')
})
// When true, the clips are cut with sox and (further below) the output files are written
const extractAudio = true
if (extractAudio) {
    // extract content from audio files
    // do not delete output folder since we have to append to existing samples from multi-speaker
    const outputFolder = 'wavs'
    // sox effects chain (original note): trim all silences more than 0.75 seconds,
    // normalize and set rate (original flac is 44100); rate 22050 comes before norm
    const outputOptions = 'silence -l 1 0.1 1% -1 0.75 1% reverse silence 1 0.1 1% reverse rate 22050 norm -1'

    // Cut the labeled segment of one entry out of its flac file into wavs/<wavFile>.wav
    const extractSegment = (e, callback) => {
        const {file, start, length} = e.label
        const inputFile = path.join(audioInputFolder, file + '.flac')
        const outputFile = path.join(outputFolder, e.wavFile + '.wav')
        const command = `sox "${inputFile}" "${outputFile}" trim ${start} ${length.toFixed(2)} ${outputOptions}`
        exec(command, callback)
    }

    const startTime = new Date()
    const maxParallel = 7 // number of sox processes running at the same time
    const runEntry = (e, mapCallback) => {
        extractSegment(e, (error, stdout, stderr) => {
            mapCallback(error || null, stderr || stdout)
        })
    }
    const reportDone = (error) => {
        if (error) console.error(error)
        const elapsedSeconds = ((Date.now() - startTime) / 1000).toFixed(2)
        console.log(`Extracted audio from flac files in ${elapsedSeconds} seconds`)
    }
    async.mapLimit(usedEntries, maxParallel, runEntry, reportDone)
}
// Count the characters (roman and sinhala) and the entry types of the selected entries,
// and tag every selected entry with its speaker
const charCountsRoman = {}
const charCountsSinhala = {}
const typeCounts = {}
for (const entry of usedEntries) {
    for (const char of entry.roman) incCounter(charCountsRoman, char)
    for (const char of entry.sinhala) incCounter(charCountsSinhala, char)
    incCounter(typeCounts, entry.type)
    entry.speaker = 'mettananda' // (e.type == 'gatha') ? 'gatha' : 'default'
}
const usedLength = usedEntries
    .map(entry => entry.label.length)
    .reduce((sum, length) => sum + length, 0)

if (extractAudio) {
    // roman character counts, most frequent first
    const charRows = Object.entries(charCountsRoman)
    charRows.sort((left, right) => right[1] - left[1])
    const charLines = charRows.map(([char, count]) => char + '\t' + count)
    fs.writeFileSync('char-counts.tsv', charLines.join('\n'), 'utf-8')

    // appended (not overwritten): metadata.csv is expected to already hold earlier samples
    const metadataLines = usedEntries.map(entry => {
        const fields = [entry.wavFile, entry.roman, entry.sinhala, entry.speaker, entry.type]
        return fields.join('|')
    })
    fs.appendFileSync('metadata.csv', '\n' + metadataLines.join('\n'), 'utf8')

    const wordLines = Object.entries(wordCounts).map(([word, count]) => word + '\t' + count)
    fs.writeFileSync('word-counts.tsv', wordLines.join('\n'), 'utf-8')

    fs.writeFileSync('text-entries.json', jsb(usedEntries, null, '\t', 100), 'utf-8')
}

// summary report
console.log(`Max clip length: ${maxClipLength} seconds. Min clip length: ${minClipLength} seconds`)
const log = (stat, count, duration) => {
    const hours = (duration / 3600).toFixed(1)
    const average = (duration / count).toFixed(2)
    console.log(`${stat} labels => count: ${count}, length: ${hours} hours, average length: ${average}`)
}
log('Total', totalLabels, totalLength)
log('Usable', outlierRemoved.length, outlierLength)
log('Used', usedEntries.length, usedLength)
console.log(typeCounts)
const romanCharacters = Object.keys(charCountsRoman).sort().join('')
const sinhalaCharacters = Object.keys(charCountsSinhala).sort().join('')
console.log(`characters="${romanCharacters}"`)
console.log(`characters="${sinhalaCharacters}"`)
console.log(`create dataset using "tar -cjf pali_dataset.tar.bz2 wavs metadata.csv"`)