#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import os
import random
import urllib.request

import numpy as np

download_dir = "/tmp/"


def check_exist_or_download(url):
    ''' download the file at url into download_dir unless already cached '''
    name = url.rsplit('/', 1)[-1]
    filename = os.path.join(download_dir, name)
    if not os.path.isfile(filename):
        print("Downloading %s" % url)
        urllib.request.urlretrieve(url, filename)
    return filename
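
# Note: the downloads cached here can be large; the GoogleNews word2vec
# archive referenced below is on the order of 1.5 GB compressed.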


def unzip_data(download_dir, data_zip):
    ''' extract the insuranceQA archive into download_dir, return data dir '''
    data_dir = download_dir + "insuranceQA-master/V2/"
    if not os.path.exists(data_dir):
        print("extracting %s to %s" % (data_zip, download_dir))
        from zipfile import ZipFile
        with ZipFile(data_zip, 'r') as zipObj:
            zipObj.extractall(download_dir)
    return data_dir


def get_label2answer(data_dir):
    ''' build a dict mapping answer label -> list of word indices '''
    import gzip
    label2answer = dict()
    with gzip.open(data_dir +
                   "/InsuranceQA.label2answer.token.encoded.gz") as fin:
        for line in fin:
            pair = line.decode().strip().split("\t")
            idxs = pair[1].split(" ")
            idxs = [int(idx.replace("idx_", "")) for idx in idxs]
            label2answer[int(pair[0])] = idxs
    return label2answer
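
# Each line of the label2answer file is tab separated: an answer label,
# then the space separated, "idx_"-prefixed word indices of that answer
# sentence. A hypothetical example line, following the parsing above:
#   "7\tidx_12 idx_5 idx_90"  ->  label2answer[7] == [12, 5, 90]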

# padding token: index 0 maps to the string "<pad>" and an all-zero
# 300-d embedding (the GoogleNews word2vec dimensionality)
pad_idx = 0
pad_string = "<pad>"
pad_embed = np.zeros((300,))

insuranceqa_train_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.train.encoded.gz"
insuranceqa_test_filename = "/InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz"
insuranceQA_url = "https://github.com/shuzi/insuranceQA/archive/master.zip"
insuranceQA_cache_fp = download_dir + "insuranceQA_cache.pickle"
google_news_pretrain_embeddings_link = "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"


def get_idx2word(data_dir):
    ''' build a dict mapping word index -> word from the vocabulary file '''
    idx2word = dict()
    with open(data_dir + "vocabulary", encoding="utf-8") as vc_f:
        for line in vc_f:
            pair = line.strip().split("\t")
            idx = int(pair[0].replace("idx_", ""))
            idx2word[idx] = pair[1]
    # add padding string to idx2word lookup
    idx2word[pad_idx] = pad_string
    return idx2word


def get_train_raw(data_dir, data_filename):
    ''' deserialize a training (or test) data file
    args:
        data_dir: directory containing the data file
        data_filename: name of the gzipped data file
    return:
        train_raw: list of QnA triples, one per sample. Each triple has
            3 fields:
            0: the question sentence as word indices; use idx2word to
               decode and idx2vec to get embeddings.
            1: the answer labels, one per ground-truth answer sentence;
               use label2answer to decode.
            2: the top-K candidate answer labels, used as negative
               answers during training.
    '''
    train_raw = []
    import gzip
    with gzip.open(data_dir + data_filename) as fin:
        for line in fin:
            tpl = line.decode().strip().split("\t")
            question = [
                int(idx.replace("idx_", "")) for idx in tpl[1].split(" ")
            ]
            ans = [int(label) for label in tpl[2].split(" ")]
            candis = [int(label) for label in tpl[3].split(" ")]
            train_raw.append((question, ans, candis))
    return train_raw
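
# Expected raw line layout, as implied by the parsing above (the values
# here are hypothetical): field 0 is unused, field 1 is the encoded
# question, field 2 the ground-truth answer labels, field 3 the
# candidate pool:
#   "<unused>\tidx_12 idx_7\t3 9\t3 9 41 77"
#   -> (question=[12, 7], ans=[3, 9], candis=[3, 9, 41, 77])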


def limit_encode_train(train_raw, label2answer, idx2word, q_seq_limit,
                       ans_seq_limit, idx2vec):
    ''' prepare training data as embedded word-vector sequences, cropped or
    padded to the given sequence limits
    return:
        questions_encoded: np ndarray, shape
            (number of samples, seq length, vector size)
        poss_encoded: same layout, sequences for positive answers
        negs_encoded: same layout, sequences for negative answers
    '''
    questions = [question for question, answers, candis in train_raw]
    # choose 1 answer from the answer pool
    poss = [
        label2answer[random.choice(answers)]
        for question, answers, candis in train_raw
    ]
    # choose 1 candidate from the candidate pool
    negs = [
        label2answer[random.choice(candis)]
        for question, answers, candis in train_raw
    ]
    # filter out words not in idx2vec
    questions_filtered = [
        [idx for idx in q if idx in idx2vec] for q in questions
    ]
    poss_filtered = [[idx for idx in ans if idx in idx2vec] for ans in poss]
    negs_filtered = [[idx for idx in ans if idx in idx2vec] for ans in negs]
    # crop to the sequence limit and pad short sequences with pad_idx
    questions_crop = [
        q[:q_seq_limit] + [pad_idx] * max(0, q_seq_limit - len(q))
        for q in questions_filtered
    ]
    poss_crop = [
        ans[:ans_seq_limit] + [pad_idx] * max(0, ans_seq_limit - len(ans))
        for ans in poss_filtered
    ]
    negs_crop = [
        ans[:ans_seq_limit] + [pad_idx] * max(0, ans_seq_limit - len(ans))
        for ans in negs_filtered
    ]
    # encode: word idx to word vector
    questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
    poss_encoded = [[idx2vec[idx] for idx in ans] for ans in poss_crop]
    negs_encoded = [[idx2vec[idx] for idx in ans] for ans in negs_crop]
    # stack into ndarrays
    questions_encoded = np.array(questions_encoded).astype(np.float32)
    poss_encoded = np.array(poss_encoded).astype(np.float32)
    negs_encoded = np.array(negs_encoded).astype(np.float32)
    return questions_encoded, poss_encoded, negs_encoded
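
# Shape sketch with hypothetical sizes: for 1000 training triples,
# q_seq_limit=20, ans_seq_limit=50 and 300-d embeddings, this returns
#   questions_encoded: (1000, 20, 300)
#   poss_encoded:      (1000, 50, 300)
#   negs_encoded:      (1000, 50, 300)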


def get_idx2vec_weights(wv, idx2word):
    ''' build a dict mapping word index -> pretrained word vector '''
    idx2vec = {k: wv[v] for k, v in idx2word.items() if v in wv}
    # add padding embedding (all zeros) to idx2vec lookup
    idx2vec[pad_idx] = pad_embed
    return idx2vec


def prepare_data(use_cache=True):
    ''' load the insuranceQA dataset and pretrained embeddings, caching the
    preprocessed result as a pickle under download_dir '''
    import pickle
    if not os.path.isfile(insuranceQA_cache_fp) or not use_cache:
        # no cache found, preprocess the data from scratch
        print("prepare data from scratch")
        # get pretrained word vectors
        from gensim.models.keyedvectors import KeyedVectors
        google_news_pretrain_fp = check_exist_or_download(
            google_news_pretrain_embeddings_link)
        wv = KeyedVectors.load_word2vec_format(google_news_pretrain_fp,
                                               binary=True)
        # prepare the insurance QA dataset
        data_zip = check_exist_or_download(insuranceQA_url)
        data_dir = unzip_data(download_dir, data_zip)
        label2answer = get_label2answer(data_dir)
        idx2word = get_idx2word(data_dir)
        idx2vec = get_idx2vec_weights(wv, idx2word)
        train_raw = get_train_raw(data_dir, insuranceqa_train_filename)
        test_raw = get_train_raw(data_dir, insuranceqa_test_filename)
        with open(insuranceQA_cache_fp, 'wb') as handle:
            pickle.dump((train_raw, test_raw, label2answer, idx2word, idx2vec),
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # load from the cached pickle
        with open(insuranceQA_cache_fp, 'rb') as handle:
            (train_raw, test_raw, label2answer, idx2word,
             idx2vec) = pickle.load(handle)
    return train_raw, test_raw, label2answer, idx2word, idx2vec


def limit_encode_eval(train_raw,
                      label2answer,
                      idx2word,
                      q_seq_limit,
                      ans_seq_limit,
                      idx2vec,
                      top_k_candi_limit=6):
    ''' prepare evaluation data as embedded word-vector sequences, cropped or
    padded to the given sequence limits
    return:
        questions_encoded: np ndarray, shape
            (number of samples, seq length, vector size)
        candi_pools_encoded: np ndarray, candidate answer pool per question,
            shape (number of samples, pool size, seq length, vector size)
        ans_count: list, number of ground-truth answers per question
    '''
    questions = [question for question, answers, candis in train_raw]
    # combine ground-truth and candidate answer labels
    candi_pools = [
        list(answers + candis)[:top_k_candi_limit]
        for question, answers, candis in train_raw
    ]
    assert all([len(pool) == top_k_candi_limit for pool in candi_pools])
    ans_count = [len(answers) for question, answers, candis in train_raw]
    assert all([c > 0 for c in ans_count])
    # decode answer labels into word-index sequences
    candi_pools_encoded = [[label2answer[candi_label]
                            for candi_label in pool]
                           for pool in candi_pools]
    # filter out words not in idx2vec
    questions_filtered = [
        [idx for idx in q if idx in idx2vec] for q in questions
    ]
    candi_pools_filtered = [[[idx
                              for idx in candi_encoded
                              if idx in idx2vec]
                             for candi_encoded in pool]
                            for pool in candi_pools_encoded]
    # crop to the sequence limit and pad short sequences with pad_idx
    questions_crop = [
        q[:q_seq_limit] + [pad_idx] * max(0, q_seq_limit - len(q))
        for q in questions_filtered
    ]
    candi_pools_crop = [[
        candi[:ans_seq_limit] + [pad_idx] * max(0, ans_seq_limit - len(candi))
        for candi in pool
    ]
                        for pool in candi_pools_filtered]
    # encode: word idx to word vector
    questions_encoded = [[idx2vec[idx] for idx in q] for q in questions_crop]
    candi_pools_encoded = [[[idx2vec[idx]
                             for idx in candi]
                            for candi in pool]
                           for pool in candi_pools_crop]
    questions_encoded = np.array(questions_encoded).astype(np.float32)
    candi_pools_encoded = np.array(candi_pools_encoded).astype(np.float32)
    # candi_pools_encoded shape:
    #   (number of QnA samples,
    #    number of candidates in the pool,
    #    number of word indices per candidate,
    #    300-d word embedding per word idx)
    # e.g. 10 QnA samples to test,
    #      5 candidate answers per question,
    #      8 words per answer,
    #      300-d vector per word
    return questions_encoded, candi_pools_encoded, ans_count
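

# Minimal usage sketch (not part of the original pipeline); the sequence
# limits below are illustrative values, not tuned ones.
if __name__ == "__main__":
    train_raw, test_raw, label2answer, idx2word, idx2vec = prepare_data()
    q, pos, neg = limit_encode_train(train_raw,
                                     label2answer,
                                     idx2word,
                                     q_seq_limit=20,
                                     ans_seq_limit=50,
                                     idx2vec=idx2vec)
    print("train shapes:", q.shape, pos.shape, neg.shape)
    q_eval, pools, ans_count = limit_encode_eval(test_raw,
                                                 label2answer,
                                                 idx2word,
                                                 q_seq_limit=20,
                                                 ans_seq_limit=50,
                                                 idx2vec=idx2vec)
    print("eval shapes:", q_eval.shape, pools.shape, len(ans_count))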