Skip to content

Commit

Permalink
Remove deprecated tensorflow.contrib.learn.python.learn.datasets.base…
Browse files Browse the repository at this point in the history
….maybe_download

Fixes coqui-ai#1531
Fixes coqui-ai#1477
  • Loading branch information
Alexandre Lissy committed Oct 2, 2018
1 parent 8f62bec commit 94b8c5c
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 38 deletions.
26 changes: 3 additions & 23 deletions bin/import_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

import csv
import tarfile
import progressbar
import requests
import subprocess

from glob import glob
Expand All @@ -20,43 +18,25 @@
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count

from util.downloader import maybe_download, SIMPLE_BAR

FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
SAMPLE_RATE = 16000
MAX_SECS = 10
ARCHIVE_DIR_NAME = 'cv_corpus_v1'
ARCHIVE_NAME = ARCHIVE_DIR_NAME + '.tar.gz'
ARCHIVE_URL = 'https://s3.us-east-2.amazonaws.com/common-voice-data-download/' + ARCHIVE_NAME

SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ', progressbar.Percentage(), ' completed']

def _download_and_preprocess_data(target_dir):
    """Fetch, extract and convert the Common Voice corpus under target_dir.

    Args:
        target_dir: directory (relative or absolute) where the corpus
            archive is downloaded and processed.
    """
    # Making path absolute
    target_dir = path.abspath(target_dir)
    # Conditionally download data (single call to the shared helper; the
    # duplicate assignment via the removed local _maybe_download is dropped)
    archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
    # Conditionally extract common voice data
    _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
    # Conditionally convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav
    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)

def _maybe_download(archive_name, target_dir, archive_url):
    """Return the local path of archive_name inside target_dir, downloading
    it from archive_url first when it is not already present."""
    archive_path = path.join(target_dir, archive_name)
    if path.exists(archive_path):
        # Already on disk - nothing to fetch.
        print('Found archive "%s" - not downloading.' % archive_path)
        return archive_path
    print('No archive "%s" - downloading...' % archive_path)
    response = requests.get(archive_url, stream=True)
    expected = int(response.headers.get('content-length', 0))
    received = 0
    with open(archive_path, 'wb') as out:
        progress = progressbar.ProgressBar(max_value=expected, widgets=SIMPLE_BAR)
        for chunk in response.iter_content(1024*1024):
            received += len(chunk)
            out.write(chunk)
            progress.update(received)
    return archive_path

def _maybe_extract(target_dir, extracted_data, archive_path):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = path.join(target_dir, extracted_data)
Expand Down
6 changes: 3 additions & 3 deletions bin/import_ldc93s1.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@

import pandas

from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download

def _download_and_preprocess_data(data_dir):
# Conditionally download data
LDC93S1_BASE = "LDC93S1"
LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
local_file = base.maybe_download(LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
trans_file = base.maybe_download(LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")
local_file = maybe_download(LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
trans_file = maybe_download(LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")
with open(trans_file, "r") as fin:
transcript = ' '.join(fin.read().strip().lower().split(' ')[2:]).replace('.', '')

Expand Down
16 changes: 8 additions & 8 deletions bin/import_librivox.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import unicodedata

from sox import Transformer
from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download
from tensorflow.python.platform import gfile

def _download_and_preprocess_data(data_dir):
Expand All @@ -34,21 +34,21 @@ def _download_and_preprocess_data(data_dir):
TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"

def filename_of(x): return os.path.split(x)[1]
train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
train_clean_100 = maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
bar.update(0)
train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
train_clean_360 = maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
bar.update(1)
train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
train_other_500 = maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
bar.update(2)

dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
dev_clean = maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
bar.update(3)
dev_other = base.maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
dev_other = maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
bar.update(4)

test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
test_clean = maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
bar.update(5)
test_other = base.maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
test_other = maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
bar.update(6)

# Conditionally extract LibriSpeech data
Expand Down
4 changes: 2 additions & 2 deletions bin/import_ted.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
from glob import glob
from os import makedirs, path, remove, rmdir
from sox import Transformer
from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download
from tensorflow.python.platform import gfile
from util.stm import parse_stm_file

def _download_and_preprocess_data(data_dir):
# Conditionally download data
TED_DATA = "TEDLIUM_release2.tar.gz"
TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)
local_file = maybe_download(TED_DATA, data_dir, TED_DATA_URL)

# Conditionally extract TED data
TED_DIR = "TEDLIUM_release2"
Expand Down
4 changes: 2 additions & 2 deletions bin/import_voxforge.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from os import makedirs, path
from bs4 import BeautifulSoup
from tensorflow.python.platform import gfile
from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download

"""The number of jobs to run in parallel"""
NUM_PARALLEL = 8
Expand Down Expand Up @@ -68,7 +68,7 @@ def download(d):
download_url = voxforge_url + '/' + file
c = counter.increment()
print('Downloading file {} ({}/{})...'.format(i+1, c, total))
base.maybe_download(filename_of(download_url), archive_dir, download_url)
maybe_download(filename_of(download_url), archive_dir, download_url)
return download

def _parallel_extracter(data_dir, number_of_test, number_of_dev, total, counter):
Expand Down
29 changes: 29 additions & 0 deletions util/downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import requests
import progressbar

from os import path, makedirs

SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ', progressbar.Percentage(), ' completed']

def maybe_download(archive_name, target_dir, archive_url):
    """Download archive_url into target_dir as archive_name unless present.

    Creates target_dir if needed. Returns the local path to the archive.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # If archive file does not exist, download it...
    archive_path = path.join(target_dir, archive_name)

    if not path.exists(target_dir):
        print('No path "%s" - creating ...' % target_dir)
        makedirs(target_dir)

    if not path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        req = requests.get(archive_url, stream=True)
        # Fail fast on 4xx/5xx instead of silently saving an HTML error
        # page to disk as if it were the archive.
        req.raise_for_status()
        total_size = int(req.headers.get('content-length', 0))
        done = 0
        with open(archive_path, 'wb') as f:
            bar = progressbar.ProgressBar(max_value=total_size, widgets=SIMPLE_BAR)
            for data in req.iter_content(1024*1024):
                done += len(data)
                f.write(data)
                bar.update(done)
            # Leave the terminal on a clean line once the download is done.
            bar.finish()
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path

0 comments on commit 94b8c5c

Please sign in to comment.