utils_lung.py

import dicom
import SimpleITK as sitk
import numpy as np
import csv
import os
from collections import defaultdict
import cPickle as pickle
import glob
import utils


def read_pkl(path):
    d = pickle.load(open(path, "rb"))
    return d['pixel_data'], d['origin'], d['spacing']

def read_mhd(path):
    itk_data = sitk.ReadImage(path.encode('utf-8'))
    pixel_data = sitk.GetArrayFromImage(itk_data)
    origin = np.array(list(reversed(itk_data.GetOrigin())))
    spacing = np.array(list(reversed(itk_data.GetSpacing())))
    return pixel_data, origin, spacing


def world2voxel(world_coord, origin, spacing):
    stretched_voxel_coord = np.absolute(world_coord - origin)
    voxel_coord = stretched_voxel_coord / spacing
    return voxel_coord


def read_dicom(path):
    d = dicom.read_file(path)
    metadata = {}
    for attr in dir(d):
        if attr[0].isupper() and attr != 'PixelData':
            try:
                metadata[attr] = getattr(d, attr)
            except AttributeError:
                pass

    metadata['InstanceNumber'] = int(metadata['InstanceNumber'])
    metadata['PixelSpacing'] = np.float32(metadata['PixelSpacing'])
    metadata['ImageOrientationPatient'] = np.float32(metadata['ImageOrientationPatient'])
    try:
        metadata['SliceLocation'] = np.float32(metadata['SliceLocation'])
    except:
        metadata['SliceLocation'] = None
    metadata['ImagePositionPatient'] = np.float32(metadata['ImagePositionPatient'])
    metadata['Rows'] = int(metadata['Rows'])
    metadata['Columns'] = int(metadata['Columns'])
    metadata['RescaleSlope'] = float(metadata['RescaleSlope'])
    metadata['RescaleIntercept'] = float(metadata['RescaleIntercept'])
    return np.array(d.pixel_array), metadata


def extract_pid_dir(patient_data_path):
    return patient_data_path.split('/')[-1]


def extract_pid_filename(file_path, replace_str='.mhd'):
    return os.path.basename(file_path).replace(replace_str, '').replace('.pkl', '')


def get_candidates_paths(path):
    id2candidates_path = {}
    file_paths = sorted(glob.glob(path + '/*.pkl'))
    for p in file_paths:
        pid = extract_pid_filename(p, '.pkl')
        id2candidates_path[pid] = p
    return id2candidates_path


def get_patient_data(patient_data_path):
    slice_paths = os.listdir(patient_data_path)
    sid2data = {}
    sid2metadata = {}
    for s in slice_paths:
        slice_id = s.split('.')[0]
        data, metadata = read_dicom(patient_data_path + '/' + s)
        sid2data[slice_id] = data
        sid2metadata[slice_id] = metadata
    return sid2data, sid2metadata


def ct2HU(x, metadata):
    x = metadata['RescaleSlope'] * x + metadata['RescaleIntercept']
    x[x < -1000] = -1000
    return x


def read_dicom_scan(patient_data_path):
    sid2data, sid2metadata = get_patient_data(patient_data_path)
    sid2position = {}
    for sid in sid2data.keys():
        sid2position[sid] = get_slice_position(sid2metadata[sid])
    sids_sorted = sorted(sid2position.items(), key=lambda x: x[1])
    sids_sorted = [s[0] for s in sids_sorted]
    z_pixel_spacing = []
    for s1, s2 in zip(sids_sorted[1:], sids_sorted[:-1]):
        z_pixel_spacing.append(sid2position[s1] - sid2position[s2])
    z_pixel_spacing = np.array(z_pixel_spacing)
    try:
        assert np.all((z_pixel_spacing - z_pixel_spacing[0]) < 0.01)
    except:
        print 'This patient has multiple series, we will remove one'
        sids_sorted_2 = []
        for s1, s2 in zip(sids_sorted[::2], sids_sorted[1::2]):
            if sid2metadata[s1]["InstanceNumber"] > sid2metadata[s2]["InstanceNumber"]:
                sids_sorted_2.append(s1)
            else:
                sids_sorted_2.append(s2)
        sids_sorted = sids_sorted_2
        z_pixel_spacing = []
        for s1, s2 in zip(sids_sorted[1:], sids_sorted[:-1]):
            z_pixel_spacing.append(sid2position[s1] - sid2position[s2])
        z_pixel_spacing = np.array(z_pixel_spacing)
        assert np.all((z_pixel_spacing - z_pixel_spacing[0]) < 0.01)

    pixel_spacing = np.array((z_pixel_spacing[0],
                              sid2metadata[sids_sorted[0]]['PixelSpacing'][0],
                              sid2metadata[sids_sorted[0]]['PixelSpacing'][1]))

    img = np.stack([ct2HU(sid2data[sid], sid2metadata[sid]) for sid in sids_sorted])

    return img, pixel_spacing


def sort_slices_position(patient_data):
    return sorted(patient_data, key=lambda x: get_slice_position(x['metadata']))


def sort_sids_by_position(sid2metadata):
    return sorted(sid2metadata.keys(), key=lambda x: get_slice_position(sid2metadata[x]))


def sort_slices_jonas(sid2metadata):
    sid2position = slice_location_finder(sid2metadata)
    return sorted(sid2metadata.keys(), key=lambda x: sid2position[x])


def get_slice_position(slice_metadata):
    """
    https://www.kaggle.com/rmchamberlain/data-science-bowl-2017/dicom-to-3d-numpy-arrays
    """
    orientation = tuple((o for o in slice_metadata['ImageOrientationPatient']))
    position = tuple((p for p in slice_metadata['ImagePositionPatient']))
    rowvec, colvec = orientation[:3], orientation[3:]
    normal_vector = np.cross(rowvec, colvec)
    slice_pos = np.dot(position, normal_vector)
    return slice_pos


def slice_location_finder(sid2metadata):
    """
    :param slicepath2metadata: dict with arbitrary keys, and metadata values
    :return:
    """

    sid2midpix = {}
    sid2position = {}

    for sid in sid2metadata:
        metadata = sid2metadata[sid]
        image_orientation = metadata["ImageOrientationPatient"]
        image_position = metadata["ImagePositionPatient"]
        pixel_spacing = metadata["PixelSpacing"]
        rows = metadata['Rows']
        columns = metadata['Columns']

        # calculate value of middle pixel
        F = np.array(image_orientation).reshape((2, 3))
        # reversed order, as per http://nipy.org/nibabel/dicom/dicom_orientation.html
        i, j = columns / 2.0, rows / 2.0
        im_pos = np.array([[i * pixel_spacing[0], j * pixel_spacing[1]]], dtype='float32')
        pos = np.array(image_position).reshape((1, 3))
        position = np.dot(im_pos, F) + pos
        sid2midpix[sid] = position[0, :]

    if len(sid2midpix) <= 1:
        for sp, midpix in sid2midpix.iteritems():
            sid2position[sp] = 0.
    else:
        # find the keys of the 2 points furthest away from each other
        max_dist = -1.0
        max_dist_keys = []
        for sp1, midpix1 in sid2midpix.iteritems():
            for sp2, midpix2 in sid2midpix.iteritems():
                if sp1 == sp2:
                    continue
                distance = np.sqrt(np.sum((midpix1 - midpix2) ** 2))
                if distance > max_dist:
                    max_dist_keys = [sp1, sp2]
                    max_dist = distance
        # project the others on the line between these 2 points
        # sort the keys, so the order is more or less the same as they were
        # max_dist_keys.sort(key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        p_ref1 = sid2midpix[max_dist_keys[0]]
        p_ref2 = sid2midpix[max_dist_keys[1]]
        v1 = p_ref2 - p_ref1
        v1 /= np.linalg.norm(v1)

        for sp, midpix in sid2midpix.iteritems():
            v2 = midpix - p_ref1
            sid2position[sp] = np.inner(v1, v2)

    return sid2position


def get_patient_data_paths(data_dir):
    pids = sorted(os.listdir(data_dir))
    return [data_dir + '/' + p for p in pids]

def read_patient_annotations_luna(pid, directory):
    return pickle.load(open(os.path.join(directory,pid+'.pkl'),"rb"))


def read_labels(file_path):
    id2labels = {}
    train_csv = open(file_path)
    lines = train_csv.readlines()
    i = 0
    for item in lines:
        if i == 0:
            i = 1
            continue
        id, label = item.replace('\n', '').split(',')
        id2labels[id] = int(label)
    return id2labels


def read_test_labels(file_path):
    id2labels = {}
    train_csv = open(file_path)
    lines = train_csv.readlines()
    i = 0
    for item in lines:
        if i == 0:
            i = 1
            continue
        id, label = item.replace('\n', '').split(';')
        id2labels[id] = int(label)
    return id2labels


def read_luna_annotations(file_path):
    id2xyzd = defaultdict(list)
    train_csv = open(file_path)
    lines = train_csv.readlines()
    i = 0
    for item in lines:
        if i == 0:
            i = 1
            continue
        id, x, y, z, d = item.replace('\n', '').split(',')
        id2xyzd[id].append([float(z), float(y), float(x), float(d)])
    return id2xyzd


def read_luna_negative_candidates(file_path):
    id2xyzd = defaultdict(list)
    train_csv = open(file_path)
    lines = train_csv.readlines()
    i = 0
    for item in lines:
        if i == 0:
            i = 1
            continue
        id, x, y, z, d = item.replace('\n', '').split(',')
        if float(d) == 0:
            id2xyzd[id].append([float(z), float(y), float(x), float(d)])
    return id2xyzd


def write_submission(pid2prediction, submission_path):
    """
    :param pid2prediction: dict of {patient_id: label}
    :param submission_path:
    """
    f = open(submission_path, 'w+')
    fo = csv.writer(f, lineterminator='\n')
    fo.writerow(['id', 'cancer'])
    for pid in pid2prediction.keys():
        fo.writerow([pid, pid2prediction[pid]])
    f.close()


def filter_close_neighbors(candidates, min_dist=16):
    #TODO pixelspacing should be added , it is now hardcoded 
    candidates_wo_dupes = set()
    no_pairs = 0
    for can1 in candidates:
        found_close_candidate = False
        swap_candidate = None
        for can2 in candidates_wo_dupes:
            if (can1 == can2).all():
                raise "Candidate should not be in the target array yet"
            else:
                delta = can1[:3] - can2[:3]
                delta[0] = 2.5*delta[0] #zyx coos
                dist = np.sum(delta**2)**(1./2)
                if dist<min_dist:
                    no_pairs += 1
                    print 'Warning: there is a pair nodules close together',  can1[:3], can2[:3]
                    found_close_candidate = True
                    if can1[4]>can2[4]:
                        swap_candidate = can2
                    break
        if not found_close_candidate:
            candidates_wo_dupes.add(tuple(can1))
        elif swap_candidate:
            candidates_wo_dupes.remove(swap_candidate)
            candidates_wo_dupes.add(tuple(can1))
    print 'n candidates filtered out', no_pairs
    return candidates_wo_dupes

def dice_index(predictions, targets, epsilon=1e-12):
    predictions = np.asarray(predictions).flatten()
    targets = np.asarray(targets).flatten()
    dice = (2. * np.sum(targets * predictions) + epsilon) / (np.sum(predictions) + np.sum(targets) + epsilon)
    return dice


def cross_entropy(predictions, targets, epsilon=1e-12):
    predictions = np.asarray(predictions).flatten()
    predictions = np.clip(predictions, epsilon, 1. - epsilon)
    targets = np.asarray(targets).flatten()
    ce = np.mean(np.log(predictions) * targets + np.log(1 - predictions) * (1. - targets))
    return ce


def get_generated_pids(predictions_dir):
    pids = []
    if os.path.isdir(predictions_dir):
        pids = os.listdir(predictions_dir)
        pids = [extract_pid_filename(p) for p in pids]
    return pids

def evaluate_log_loss(pid2prediction, pid2label):
    predictions, labels = [], []
    assert set(pid2prediction.keys()) == set(pid2label.keys())
    for k, v in pid2prediction.iteritems():
        predictions.append(v)
        labels.append(pid2label[k])
    return log_loss(labels, predictions)


def log_loss(y_real, y_pred, eps=1e-15):
    y_pred = np.clip(y_pred, eps, 1 - eps)
    y_real = np.array(y_real)
    losses = y_real * np.log(y_pred) + (1 - y_real) * np.log(1 - y_pred)
    return - np.average(losses)


def read_luna_properties(file_path):
    id2xyzp = defaultdict(list)
    train_csv = open(file_path)
    lines = train_csv.readlines()
    i = 0
    for item in lines:
        if i == 0:
            i = 1
            continue
        annotation = item.replace('\n', '').split(',')
        id = annotation[0]
        x = float(annotation[1])
        y = float(annotation[2])
        z = float(annotation[3])
        d = float(annotation[4])
        properties_dict = {
            'diameter': d,
            'calcification': float(annotation[5]),
            'internalStructure': float(annotation[6]),
            'lobulation': float(annotation[7]),
            'malignancy': float(annotation[8]),
            'margin': float(annotation[9]),
            'sphericity': float(annotation[10]),
            'spiculation': float(annotation[11]),
            'subtlety': float(annotation[12]),
            'texture': float(annotation[13]),
        }

        id2xyzp[id].append([z, y, x, d, properties_dict])
    return id2xyzp