# ICILearn / learnopencv
# mirror of https://github.com/spmallick/learnopencv.git

from collections import namedtuple

import cv2
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv2D,
)

from FullyConvolutionalResnet50 import fully_convolutional_resnet50

# Axis-aligned rectangle stored as two corners: (x1, y1) and (x2, y2).
# find_rect() fills it from cv2.boundingRect as (x, y, x + width, y + height).
Rect = namedtuple("Rect", "x1 y1 x2 y2")


def _set_constant_weights(layer):
    """Overwrite Conv2D / BatchNormalization weights with constants.

    Layers of any other type are left untouched.
    """
    if isinstance(layer, Conv2D):
        weights = layer.get_weights()
        new_weights = [np.full(weights[0].shape, 0.005)]
        # the bias entry is absent when the layer was built with use_bias=False
        if len(weights) > 1:
            new_weights.append(np.full(weights[1].shape, 0.0))
        layer.set_weights(new_weights)
    if isinstance(layer, BatchNormalization):
        weights = layer.get_weights()
        # weight sequence: gamma, beta, running mean, running variance
        layer.set_weights(
            [
                weights[0],
                weights[1],
                np.full(weights[2].shape, 0.0),
                np.full(weights[3].shape, 1.0),
            ]
        )


def backprop_receptive_field(
    image, predicted_class, scoremap, use_max_activation=False,
):
    """Back-propagate a mask through a constant-weight network.

    A fresh fully convolutional ResNet-50 is built without pretrained
    weights. Every convolution kernel is set to 0.005, every convolution
    bias to 0, and every batch-norm running mean / variance to 0 / 1.
    A mask over the network output is then back-propagated to an all-ones
    input; the resulting input gradient is returned.

    Args:
        image: batched input tensor. Its last three dimensions are used as
            the model's ``input_shape``
            (presumably 1 x H x W x C - confirm against the caller).
        predicted_class: index along the last axis of the model output whose
            slice receives ``scoremap`` (used only when
            ``use_max_activation`` is False).
        scoremap: score map of the predicted class. It is assigned to
            ``mask[:, :, :, predicted_class]``, so it has to match that
            slice's shape.
        use_max_activation: if True, the mask is 1 only at the location of
            the maximum of ``scoremap`` instead of holding the whole map.

    Returns:
        The 2-D slice ``[0, 0]`` of the input gradient after the last axis
        has been moved to position 1, i.e. the gradient for the first
        channel of the first batch element.
    """
    model = fully_convolutional_resnet50(
        input_shape=(image.shape[-3:]), pretrained_resnet=False,
    )

    for layer in model.layers:
        try:
            _set_constant_weights(layer)
        except Exception as exc:
            # Best effort: a layer with an unexpected weight layout keeps its
            # weights, but the failure is reported instead of being hidden.
            print("Skipping weight reset for layer", layer.name, ":", exc)

    ones_input = tf.ones_like(image)
    # the forward pass is only needed for the shape and dtype of the output
    out = model.predict(image)
    receptive_field_mask = tf.Variable(tf.zeros_like(out))

    if not use_max_activation:
        receptive_field_mask[:, :, :, predicted_class].assign(scoremap)
    else:
        # column-wise maxima over the rows, and the rows they occur in
        scoremap_max_row_values = tf.math.reduce_max(scoremap, axis=1)
        max_row_id = tf.math.argmax(scoremap, axis=1)
        # the column holding the overall maximum, then its row
        max_col_id = tf.math.argmax(scoremap_max_row_values, axis=1).numpy()[0]
        max_row_id = max_row_id[0, max_col_id].numpy()
        print(
            "Coords of the max activation:", max_row_id, max_col_id,
        )
        # Mark only the max-activation location. Channel 0 is used regardless
        # of predicted_class.
        # NOTE(review): with identical weights in every convolution all output
        # channels presumably behave the same - confirm.
        receptive_field_mask = tf.tensor_scatter_nd_update(
            receptive_field_mask, [(0, max_row_id, max_col_id, 0)], [1],
        )

    with tf.GradientTape() as tape:
        # ones_input is a plain tensor, so it has to be watched explicitly
        tape.watch(ones_input)
        preds = model(ones_input)
        # the mask selects which outputs contribute to the pseudo loss
        pseudo_loss = K.mean(preds * receptive_field_mask)

    # Taken outside the context so the gradient ops are not recorded.
    grad = tape.gradient(pseudo_loss, ones_input)
    grad = tf.transpose(grad, perm=[0, 3, 1, 2])
    return grad[0, 0]


def find_rect(activations):
    """Return the bounding rectangle of the first above-threshold blob.

    Args:
        activations: 2-D array of activations; values are compared against
            a fixed threshold of 0.65 (callers in this file pass maps
            normalised to [0, 1]).

    Returns:
        A ``Rect`` with the corners of the bounding box of the first contour
        found, or ``Rect(0, 0, 0, 0)`` when no value exceeds the threshold.
    """
    # Dilate and erode the activations to remove grid-like artifacts
    kernel = np.ones((5, 5), np.uint8)
    closed = cv2.dilate(activations, kernel=kernel)
    closed = cv2.erode(closed, kernel=kernel)

    # Binarize the activations (astype already returns a fresh array)
    _, binary = cv2.threshold(closed, 0.65, 1, type=cv2.THRESH_BINARY)
    binary = binary.astype(np.uint8)

    # Find the contours of the binary blobs. OpenCV 3.x returns
    # (image, contours, hierarchy) and 4.x returns (contours, hierarchy);
    # the contours are the second-to-last element in both.
    contours = cv2.findContours(
        binary, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE,
    )[-2]

    # Nothing above the threshold: report an empty rectangle, not IndexError.
    if len(contours) == 0:
        return Rect(0, 0, 0, 0)

    # Find bounding box around the object.
    x, y, width, height = cv2.boundingRect(contours[0])
    return Rect(x, y, x + width, y + height)


def normalize(activations):
    """Rescale activations linearly so that min maps to 0 and max to 1.

    Only arithmetic operators are applied to the input, so the result is
    whatever type those operators return for it.

    Args:
        activations: non-empty array of numbers.

    Returns:
        ``(activations - min) / (max - min)``. A constant input has no range
        to stretch and yields all zeros instead of NaN.
    """
    shifted = activations - np.min(activations)
    peak = np.max(shifted)
    # Guard against 0 / 0 for a constant map; dividing by 1 keeps the zeros.
    if peak == 0:
        peak = 1
    return shifted / peak


def visualize_activations(image, activations, show_bounding_rect=False):
    """Darken an image according to an activation map.

    Args:
        image: H x W x 3 image array.
        activations: H x W activation map, either a NumPy array or an eager
            tensor. It is normalised before use.
        show_bounding_rect: if True, draw the rectangle returned by
            ``find_rect`` for the normalised activations onto the result.

    Returns:
        The ``uint8`` image obtained by multiplying every channel of
        ``image`` by the normalised activations.
    """
    # np.asarray takes NumPy arrays and eager tensors alike, so the code
    # below no longer depends on the input offering a .numpy() method.
    activations = np.asarray(normalize(activations))

    # repeat the single-channel map so it multiplies all three channels
    activations_multichannel = np.stack([activations] * 3, axis=2)
    masked_image = (image * activations_multichannel).astype(np.uint8)

    if show_bounding_rect:
        rect = find_rect(activations)
        cv2.rectangle(
            masked_image,
            (rect.x1, rect.y1),
            (rect.x2, rect.y2),
            color=(0, 0, 255),
            thickness=2,
        )

    return masked_image


def run_resnet_inference(original_image):
    """Classify an image with the fully convolutional ResNet-50 and show it.

    The function reads the class labels from ``imagenet_classes.txt``
    (relative path), runs the model on the image, prints the predicted
    class, computes two receptive-field maps with
    ``backprop_receptive_field`` and displays four OpenCV windows. It
    returns nothing and waits in ``cv2.waitKey(0)`` before returning.

    Args:
        original_image: image array in BGR channel order (it is converted
            with ``cv2.COLOR_BGR2RGB`` before pre-processing).
    """

    # read ImageNet class ids to a list of labels
    with open("imagenet_classes.txt") as f:
        labels = [line.strip() for line in f.readlines()]

    # convert image to the RGB format
    image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)

    # pre-process image
    image = preprocess_input(image)

    # add a leading batch axis; the remaining axes keep their order
    image = tf.expand_dims(image, 0)

    # load the fully convolutional resnet50 model
    # (pretrained_resnet is left at its default here; presumably that loads
    # the pretrained ImageNet weights - confirm in FullyConvolutionalResnet50)
    model = fully_convolutional_resnet50(input_shape=(image.shape[-3:]))

    # Perform inference.
    # Instead of a single score vector, the model returns a map of scores.
    # The transpose moves the last axis of the output to position 1 and the
    # softmax is taken over that axis, so from here on `preds` is laid out
    # as 1 x classes x n x m (presumably 1000 ImageNet classes; n and m
    # depend on the size of the image).
    preds = model.predict(image)
    preds = tf.transpose(preds, perm=[0, 3, 1, 2])
    preds = tf.nn.softmax(preds, axis=1)
    print("Response map shape : ", preds.shape)

    # find class with the maximum score in the n x m output map:
    # best score and best class at every location
    pred = tf.math.reduce_max(preds, axis=1)
    class_idx = tf.math.argmax(preds, axis=1)

    # for every column: the maximum over the rows and the row it occurs in
    row_max = tf.math.reduce_max(pred, axis=1)
    row_idx = tf.math.argmax(pred, axis=1)

    # column that holds the overall maximum
    col_idx = tf.math.argmax(row_max, axis=1)

    # class at (row of the maximum, column of the maximum)
    predicted_class = tf.gather_nd(
        class_idx, (0, tf.gather_nd(row_idx, (0, col_idx[0])), col_idx[0]),
    )

    # print the top predicted class
    print("Predicted Class : ", labels[predicted_class], predicted_class)

    # find the n x m score map for the predicted class
    # (a leading axis of size 1 is added again before converting to NumPy)
    score_map = tf.expand_dims(preds[0, predicted_class, :, :], 0).numpy()
    print("Score Map shape : ", score_map.shape)

    # compute the receptive field for the max activation pixel
    receptive_field_max_activation = backprop_receptive_field(
        image,
        scoremap=score_map,
        predicted_class=predicted_class,
        use_max_activation=True,
    )
    # compute the receptive field for the whole image
    receptive_field_image = backprop_receptive_field(
        image,
        scoremap=score_map,
        predicted_class=predicted_class,
        use_max_activation=False,
    )

    # resize score map to the original image size
    # (cv2.resize takes the target size as (width, height))
    score_map = score_map[0]
    score_map = cv2.resize(
        score_map, (original_image.shape[1], original_image.shape[0]),
    )

    # display the images
    # NOTE(review): the title of the score-map window mentions a bbox, but
    # show_bounding_rect is left at its default (False) for that call, so no
    # rectangle is drawn there.
    cv2.imshow("Original Image", original_image)
    cv2.imshow(
        "Score map: activations and bbox",
        visualize_activations(original_image, score_map),
    )
    cv2.imshow(
        "receptive_field_max_activation",
        visualize_activations(
            original_image, receptive_field_max_activation, show_bounding_rect=True,
        ),
    )
    cv2.imshow(
        "receptive_field_the_whole_image",
        visualize_activations(
            original_image, receptive_field_image, show_bounding_rect=True,
        ),
    )
    # wait for a key press before returning
    cv2.waitKey(0)


def main(image_path="camel.jpg"):
    """Load an image from disk and run the receptive-field demo on it.

    Args:
        image_path: path of the image to analyse; defaults to the bundled
            example name ``camel.jpg``.

    Raises:
        OSError: if OpenCV cannot read an image from ``image_path``.
    """
    # read the image
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with a clear message instead of deep inside the pipeline.
    if image is None:
        raise OSError("Could not read image file: {!r}".format(image_path))

    # run inference
    run_resnet_inference(image)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()