import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.densenet import DenseNet121
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras import backend as K
from keras.models import load_model
import util
The dataset used is ChestX-ray8 dataset which contains 108,948 frontal-view X-ray images of 32,717 unique patients.
The entire dataset is available here.
IMAGE_DIR
variable.The dataset includes a CSV file that provides the labels for each X-ray.
For the smaller dataset available locally, corresponding labels are gathered in these 3 files:
nih/train-small.csv
: 875 images from our dataset to be used for training.nih/valid-small.csv
: 109 images from our dataset to be used for validation.nih/test.csv
: 420 images from our dataset to be used for testing. This dataset has been annotated by consensus among four different radiologists for 5 of the 14 pathologies:
Consolidation
Edema
Effusion
Cardiomegaly
Atelectasis
train_df = pd.read_csv("nih/train-small.csv")
valid_df = pd.read_csv("nih/valid-small.csv")
test_df = pd.read_csv("nih/test.csv")
train_df.head()
labels = ['Cardiomegaly',
'Emphysema',
'Effusion',
'Hernia',
'Infiltration',
'Mass',
'Nodule',
'Atelectasis',
'Pneumothorax',
'Pleural_Thickening',
'Pneumonia',
'Fibrosis',
'Edema',
'Consolidation']
It is worth noting that our dataset contains multiple images for each patient. This could be the case, for example, when a patient has taken multiple X-ray images at different times during their hospital visits. In our data splitting, we have ensured that the split is done on the patient level so that there is no data "leakage" between the train, validation, and test datasets.
def check_for_leakage(df1, df2, patient_col):
'''
return true if there are any patients found in df1 and df2
'''
patients_in_both_groups = set(df1[patient_col]).intersection(df2[patient_col])
return len(patients_in_both_groups) > 0
if not check_for_leakage(train_df, test_df, 'PatientId') and not check_for_leakage(valid_df, test_df, 'PatientId'):
print("NO DATA LEAKAGE!")
else: print("DATA LEAKAGE FOUND!")
We use ImageDataGenerator for:
def get_train_generator(df, image_dir, x_col, y_cols, shuffle=True, batch_size=8, seed=1, target_w = 320, target_h = 320):
"""
Return generator for training set, normalizing using batch
statistics.
Args:
train_df (dataframe): dataframe specifying training data.
image_dir (str): directory where image files are held.
x_col (str): name of column in df that holds filenames.
y_cols (list): list of strings that hold y labels for images.
batch_size (int): images per batch to be fed into model during training.
seed (int): random seed.
target_w (int): final width of input images.
target_h (int): final height of input images.
Returns:
train_generator (DataFrameIterator): iterator over training set
"""
# normalize images
image_generator = ImageDataGenerator(
samplewise_center=True,
samplewise_std_normalization= True)
# flow from directory with specified batch size
# and target image size
generator = image_generator.flow_from_dataframe(
dataframe=df,
directory=image_dir,
x_col=x_col,
y_col=y_cols,
class_mode="raw",
batch_size=batch_size,
shuffle=shuffle,
seed=seed,
target_size=(target_w,target_h))
return generator
Now we need to build a new generator for validation and testing data.
Why can't we use the same generator as for the training data?
Look back at the generator we wrote for the training data.
What we need to do is normalize incoming test data using the statistics computed from the training set.
def get_test_and_valid_generator(valid_df, test_df, train_df, image_dir, x_col, y_cols, sample_size=100, batch_size=8, seed=1, target_w = 320, target_h = 320):
"""
Return generator for validation set and test set using
normalization statistics from training set.
Args:
valid_df (dataframe): dataframe specifying validation data.
test_df (dataframe): dataframe specifying test data.
train_df (dataframe): dataframe specifying training data.
image_dir (str): directory where image files are held.
x_col (str): name of column in df that holds filenames.
y_cols (list): list of strings that hold y labels for images.
sample_size (int): size of sample to use for normalization statistics.
batch_size (int): images per batch to be fed into model during training.
seed (int): random seed.
target_w (int): final width of input images.
target_h (int): final height of input images.
Returns:
test_generator (DataFrameIterator) and valid_generator: iterators over test set and validation set respectively
"""
# get generator to sample dataset
raw_train_generator = ImageDataGenerator().flow_from_dataframe(
dataframe=train_df,
directory=IMAGE_DIR,
x_col="Image",
y_col=labels,
class_mode="raw",
batch_size=sample_size,
shuffle=True,
target_size=(target_w, target_h))
# use sample to fit mean and std for test set generator
image_generator = ImageDataGenerator(
featurewise_center=True,
featurewise_std_normalization= True)
# fit generator to sample from training data
batch = raw_train_generator.next()
data_sample = batch[0]
image_generator.fit(data_sample)
# get test generator
valid_generator = image_generator.flow_from_dataframe(
dataframe=valid_df,
directory=image_dir,
x_col=x_col,
y_col=y_cols,
class_mode="raw",
batch_size=batch_size,
shuffle=False,
seed=seed,
target_size=(target_w,target_h))
test_generator = image_generator.flow_from_dataframe(
dataframe=test_df,
directory=image_dir,
x_col=x_col,
y_col=y_cols,
class_mode="raw",
batch_size=batch_size,
shuffle=False,
seed=seed,
target_size=(target_w,target_h))
return valid_generator, test_generator
With our generator function ready, let's make one generator for our training data and one each of our test and validation datasets.
IMAGE_DIR = "nih/images-small/"
train_generator = get_train_generator(train_df, IMAGE_DIR, "Image", labels)
valid_generator, test_generator= get_test_and_valid_generator(valid_df, test_df, train_df, IMAGE_DIR, "Image", labels)
x, y = train_generator.next()
plt.imshow(x[0]);
Now we'll move on to model training and development. We have a few practical challenges to deal with before actually training a neural network, though. The first is class imbalance.
One of the challenges with working with medical diagnostic datasets is the large class imbalance present in such datasets. Let's plot the frequency of each of the labels in our dataset:
plt.xticks(rotation=90)
plt.bar(x=labels, height=np.mean(train_generator.labels, axis=0))
plt.title("Frequency of Each Class")
plt.show()
We can see from this plot that the prevalance of positive cases varies significantly across the different pathologies. (These trends mirror the ones in the full dataset as well.)
Hernia
pathology has the greatest imbalance with the proportion of positive training cases being about 0.2%. Infiltration
pathology, which has the least amount of imbalance, has only 17.5% of the training cases labelled positive.Ideally, we would train our model using an evenly balanced dataset so that the positive and negative training cases would contribute equally to the loss.
If we use a normal cross-entropy loss function with a highly unbalanced dataset, as we are seeing here, then the algorithm will be incentivized to prioritize the majority class (i.e negative in our case), since it contributes more to the loss.
Let's take a closer look at this. Assume we would have used a normal cross-entropy loss for each pathology. We recall that the cross-entropy loss contribution from the $i^{th}$ training data case is:
$$\mathcal{L}_{cross-entropy}(x_i) = -(y_i \log(f(x_i)) + (1-y_i) \log(1-f(x_i))),$$where $x_i$ and $y_i$ are the input features and the label, and $f(x_i)$ is the output of the model, i.e. the probability that it is positive.
Note that for any training case, either $y_i=0$ or else $(1-y_i)=0$, so only one of these terms contributes to the loss (the other term is multiplied by zero, and becomes zero).
We can rewrite the overall average cross-entropy loss over the entire training set $\mathcal{D}$ of size $N$ as follows:
$$\mathcal{L}_{cross-entropy}(\mathcal{D}) = - \frac{1}{N}\big( \sum_{\text{positive examples}} \log (f(x_i)) + \sum_{\text{negative examples}} \log(1-f(x_i)) \big).$$Using this formulation, we can see that if there is a large imbalance with very few positive training cases, for example, then the loss will be dominated by the negative class. Summing the contribution over all the training cases for each class (i.e. pathological condition), we see that the contribution of each class (i.e. positive or negative) is:
$$freq_{p} = \frac{\text{number of positive examples}}{N} $$$$\text{and}$$$$freq_{n} = \frac{\text{number of negative examples}}{N}.$$Complete the function below to calculate these frequences for each label in our dataset.
def compute_class_freqs(labels):
"""
Compute positive and negative frequences for each class.
Args:
labels (np.array): matrix of labels, size (num_examples, num_classes)
Returns:
positive_frequencies (np.array): array of positive frequences for each
class, size (num_classes)
negative_frequencies (np.array): array of negative frequences for each
class, size (num_classes)
"""
# total number of patients (rows)
N = labels.shape[0]
positive_frequencies = np.sum(labels==1, axis=0) / N
negative_frequencies = np.sum(labels==0, axis=0) / N
return positive_frequencies, negative_frequencies
Now we'll compute frequencies for our training data.
freq_pos, freq_neg = compute_class_freqs(train_generator.labels)
freq_pos
Let's visualize these two contribution ratios next to each other for each of the pathologies:
data = pd.DataFrame({"Class": labels, "Label": "Positive", "Value": freq_pos})
data = data.append([{"Class": labels[l], "Label": "Negative", "Value": v} for l,v in enumerate(freq_neg)], ignore_index=True)
plt.xticks(rotation=90)
f = sns.barplot(x="Class", y="Value", hue="Label" ,data=data)
As we see in the above plot, the contributions of positive cases is significantly lower than that of the negative ones. However, we want the contributions to be equal. One way of doing this is by multiplying each example from each class by a class-specific weight factor, $w_{pos}$ and $w_{neg}$, so that the overall contribution of each class is the same.
To have this, we want
$$w_{pos} \times freq_{p} = w_{neg} \times freq_{n},$$which we can do simply by taking
$$w_{pos} = freq_{neg}$$$$w_{neg} = freq_{pos}$$This way, we will be balancing the contribution of positive and negative labels.
pos_weights = freq_neg
neg_weights = freq_pos
pos_contribution = freq_pos * pos_weights
neg_contribution = freq_neg * neg_weights
Let's verify this by graphing the two contributions next to each other again:
data = pd.DataFrame({"Class": labels, "Label": "Positive", "Value": pos_contribution})
data = data.append([{"Class": labels[l], "Label": "Negative", "Value": v}
for l,v in enumerate(neg_contribution)], ignore_index=True)
plt.xticks(rotation=90)
sns.barplot(x="Class", y="Value", hue="Label" ,data=data);
As the above figure shows, by applying these weightings the positive and negative labels within each class would have the same aggregate contribution to the loss function. Now let's implement such a loss function.
After computing the weights, our final weighted loss for each training case will be
$$\mathcal{L}_{cross-entropy}^{w}(x) = - (w_{p} y \log(f(x)) + w_{n}(1-y) \log( 1 - f(x) ) ).$$def get_weighted_loss(pos_weights, neg_weights, epsilon=1e-7):
"""
Return weighted loss function given negative weights and positive weights.
Args:
pos_weights (np.array): array of positive weights for each class, size (num_classes)
neg_weights (np.array): array of negative weights for each class, size (num_classes)
Returns:
weighted_loss (function): weighted loss function
"""
def weighted_loss(y_true, y_pred):
"""
Return weighted loss value.
Args:
y_true (Tensor): Tensor of true labels, size is (num_examples, num_classes)
y_pred (Tensor): Tensor of predicted labels, size is (num_examples, num_classes)
Returns:
loss (Float): overall scalar loss summed across all classes
"""
# initialize loss to zero
loss = 0.0
for i in range(len(pos_weights)):
# for each class, add average weighted loss for that class
loss += K.mean(pos_weights[i]*y_true[:,i]*-K.log(y_pred[:,i]+epsilon) + neg_weights[i]*(1-y_true[:,i])*-K.log(1-y_pred[:,i]+epsilon) ) #complete this line
return loss
return weighted_loss
Next, we will use a pre-trained DenseNet121 model which we can load directly from Keras and then add two layers on top of it:
GlobalAveragePooling2D
layer to get the average of the last convolution layers from DenseNet121.Dense
layer with sigmoid
activation to get the prediction logits for each of our classes.We can set our custom loss function for the model by specifying the loss
parameter in the compile()
function.
# create the base pre-trained model
base_model = DenseNet121(weights='./nih/densenet.hdf5', include_top=False)
x = base_model.output
# add a global spatial average pooling layer
x = GlobalAveragePooling2D()(x)
# and a logistic layer
predictions = Dense(len(labels), activation="sigmoid")(x)
model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss=get_weighted_loss(pos_weights, neg_weights))
The model ready for training using model.fit()
this is a model (with the same architecture) trained on the original dataset (which is 40GB+), which takes long hours to train.
model.load_weights("./nih/pretrained_model.h5")
predicted_vals = model.predict_generator(test_generator, steps = len(test_generator))
auc_rocs = util.get_roc_curve(labels, predicted_vals, test_generator)
You can compare the performance to the AUCs reported in the original ChexNeXt paper in the table below:
For reference, here's the AUC figure from the ChexNeXt paper which includes AUC values for their model as well as radiologists on this dataset:
This method does take advantage of a few other tricks such as self-training and ensembling as well, which can give a significant boost to the performance.
First we will load the small training set and setup to look at the 4 classes with the highest performing AUC measures.
df = pd.read_csv("nih/train-small.csv")
# only show the labels with top 4 AUC
labels_to_show = np.take(labels, np.argsort(auc_rocs)[::-1])[:4]
Now let's look at a few specific images.
util.compute_gradcam(model, '00008270_015.png', IMAGE_DIR, df, labels, labels_to_show)
util.compute_gradcam(model, '00011355_002.png', IMAGE_DIR, df, labels, labels_to_show)
util.compute_gradcam(model, '00029855_001.png', IMAGE_DIR, df, labels, labels_to_show)
util.compute_gradcam(model, '00005410_000.png', IMAGE_DIR, df, labels, labels_to_show)