Skip to content

Commit

Permalink
Remove deprecated tensorflow.contrib.learn.python.learn.datasets.base…
Browse files Browse the repository at this point in the history
….maybe_download

Fixes coqui-ai#1531
Fixes coqui-ai#1477
  • Loading branch information
Alexandre Lissy committed Oct 2, 2018
1 parent 8f62bec commit 94b8c5c
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 38 deletions.
26 changes: 3 additions & 23 deletions bin/import_cv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

import csv
import tarfile
import progressbar
import requests
import subprocess

from glob import glob
Expand All @@ -20,43 +18,25 @@
from multiprocessing.dummy import Pool
from multiprocessing import cpu_count

from util.downloader import maybe_download, SIMPLE_BAR

FIELDNAMES = ['wav_filename', 'wav_filesize', 'transcript']
SAMPLE_RATE = 16000
MAX_SECS = 10
ARCHIVE_DIR_NAME = 'cv_corpus_v1'
ARCHIVE_NAME = ARCHIVE_DIR_NAME + '.tar.gz'
ARCHIVE_URL = 'https://s3.us-east-2.amazonaws.com/common-voice-data-download/' + ARCHIVE_NAME

SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ', progressbar.Percentage(), ' completed']

def _download_and_preprocess_data(target_dir):
    """Fetch, extract and convert the Common Voice corpus under target_dir.

    Args:
        target_dir: directory (relative or absolute) where the corpus
            archive is downloaded and processed.
    """
    # Making path absolute
    target_dir = path.abspath(target_dir)
    # Conditionally download data (single call to the shared helper; the
    # duplicate assignment via the removed local _maybe_download is dropped)
    archive_path = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
    # Conditionally extract common voice data
    _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_path)
    # Conditionally convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav
    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)

def _maybe_download(archive_name, target_dir, archive_url):
    """Return the local path of archive_name inside target_dir, downloading
    it from archive_url first when it is not already present."""
    archive_path = path.join(target_dir, archive_name)
    if path.exists(archive_path):
        # Already on disk - nothing to fetch.
        print('Found archive "%s" - not downloading.' % archive_path)
        return archive_path
    print('No archive "%s" - downloading...' % archive_path)
    response = requests.get(archive_url, stream=True)
    expected = int(response.headers.get('content-length', 0))
    received = 0
    with open(archive_path, 'wb') as out:
        progress = progressbar.ProgressBar(max_value=expected, widgets=SIMPLE_BAR)
        for chunk in response.iter_content(1024*1024):
            received += len(chunk)
            out.write(chunk)
            progress.update(received)
    return archive_path

def _maybe_extract(target_dir, extracted_data, archive_path):
# If target_dir/extracted_data does not exist, extract archive in target_dir
extracted_path = path.join(target_dir, extracted_data)
Expand Down
6 changes: 3 additions & 3 deletions bin/import_ldc93s1.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@

import pandas

from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download

def _download_and_preprocess_data(data_dir):
# Conditionally download data
LDC93S1_BASE = "LDC93S1"
LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
local_file = base.maybe_download(LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
trans_file = base.maybe_download(LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")
local_file = maybe_download(LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
trans_file = maybe_download(LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")
with open(trans_file, "r") as fin:
transcript = ' '.join(fin.read().strip().lower().split(' ')[2:]).replace('.', '')

Expand Down
16 changes: 8 additions & 8 deletions bin/import_librivox.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import unicodedata

from sox import Transformer
from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download
from tensorflow.python.platform import gfile

def _download_and_preprocess_data(data_dir):
Expand All @@ -34,21 +34,21 @@ def _download_and_preprocess_data(data_dir):
TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"

def filename_of(x): return os.path.split(x)[1]
train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
train_clean_100 = maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
bar.update(0)
train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
train_clean_360 = maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
bar.update(1)
train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
train_other_500 = maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
bar.update(2)

dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
dev_clean = maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
bar.update(3)
dev_other = base.maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
dev_other = maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
bar.update(4)

test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
test_clean = maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
bar.update(5)
test_other = base.maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
test_other = maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
bar.update(6)

# Conditionally extract LibriSpeech data
Expand Down
4 changes: 2 additions & 2 deletions bin/import_ted.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
from glob import glob
from os import makedirs, path, remove, rmdir
from sox import Transformer
from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download
from tensorflow.python.platform import gfile
from util.stm import parse_stm_file

def _download_and_preprocess_data(data_dir):
# Conditionally download data
TED_DATA = "TEDLIUM_release2.tar.gz"
TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)
local_file = maybe_download(TED_DATA, data_dir, TED_DATA_URL)

# Conditionally extract TED data
TED_DIR = "TEDLIUM_release2"
Expand Down
4 changes: 2 additions & 2 deletions bin/import_voxforge.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from os import makedirs, path
from bs4 import BeautifulSoup
from tensorflow.python.platform import gfile
from tensorflow.contrib.learn.python.learn.datasets import base
from util.downloader import maybe_download

"""The number of jobs to run in parallel"""
NUM_PARALLEL = 8
Expand Down Expand Up @@ -68,7 +68,7 @@ def download(d):
download_url = voxforge_url + '/' + file
c = counter.increment()
print('Downloading file {} ({}/{})...'.format(i+1, c, total))
base.maybe_download(filename_of(download_url), archive_dir, download_url)
maybe_download(filename_of(download_url), archive_dir, download_url)
return download

def _parallel_extracter(data_dir, number_of_test, number_of_dev, total, counter):
Expand Down
29 changes: 29 additions & 0 deletions util/downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import requests
import progressbar

from os import path, makedirs

SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ', progressbar.Percentage(), ' completed']

def maybe_download(archive_name, target_dir, archive_url):
    """Download archive_url into target_dir as archive_name unless present.

    Creates target_dir if needed. Returns the local path to the archive.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # If archive file does not exist, download it...
    archive_path = path.join(target_dir, archive_name)

    if not path.exists(target_dir):
        print('No path "%s" - creating ...' % target_dir)
        makedirs(target_dir)

    if not path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        req = requests.get(archive_url, stream=True)
        # Fail fast on 4xx/5xx instead of silently saving an HTML error
        # page to disk as if it were the archive.
        req.raise_for_status()
        total_size = int(req.headers.get('content-length', 0))
        done = 0
        with open(archive_path, 'wb') as f:
            bar = progressbar.ProgressBar(max_value=total_size, widgets=SIMPLE_BAR)
            for data in req.iter_content(1024*1024):
                done += len(data)
                f.write(data)
                bar.update(done)
            # Leave the terminal on a clean line once the download is done.
            bar.finish()
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path

0 comments on commit 94b8c5c

Please sign in to comment.