# FILTER WARNINGS

In [None]:
# Filter warnings: register an "ignore" rule for every FutureWarning so that
# warnings of that category are not shown from this point on.
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

# IMPORT LIBRARIES

Go to https://keras.io/api/applications/ and check the necessary imports to work with VGG16: VGG16, image and preprocess_input

In [None]:
# Import libraries from keras. Import VGG16
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

# Other Keras imports
from keras.models import Model

# Other Imports
from sklearn.preprocessing import LabelEncoder
import numpy as np
import h5py
import os
import datetime
import time

#imports for the testing phase
from __future__ import print_function

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import sklearn.metrics
from sklearn import svm
import statistics as st
import matplotlib.pyplot as plt
plt.switch_backend('agg')

# google colab
from google.colab import drive # Necessary to access data stored in google drive

# DEFINITION OF DIRECTORIES



The data will be stored in Google Drive, so that we can access it easily from Google Colab. Therefore, we must mount Google Drive and define the directories where the images are stored within Google Drive.

It is __very important__ that this notebook and the "Images_Intact_Damaged_Inserts" folder with the data are placed in the folder Colab_Notebooks, in the root of Google Drive. If you are using a different directory, change the code accordingly.

In [None]:
# Path at which Google Drive is mounted.
mount_dir = '/content/gdrive'
# Mount Google Drive at mount_dir, using the `drive` helper imported from
# google.colab in the imports cell.
drive.mount(mount_dir)

In [None]:
# %% 0 - Definition of directories
# Layout described in the text above (notebook and data inside Colab_Notebooks):
#base_dir = os.path.join(mount_dir, 'MyDrive', 'Colab_Notebooks')

# Layout currently in use. The path is built from mount_dir so that the mount
# point is defined in a single place; the trailing '' keeps the final path
# separator that the original string literal had.
base_dir = os.path.join(
    mount_dir, 'MyDrive', 'Universidad', 'Docencia', 'Proyecto EIT HEI',
    '2023_EIT-HEI_REGINNA_40', '4_SummerSchools',
    '01_202307_Summer_School_July2023', '2_Material_Courses',
    '2_Use_case_CNN', '')
# Folder that contains one sub-folder per class.
images_dir = os.path.join(base_dir, 'Images_Intact_Damaged_Inserts')

# Fail early with an actionable message instead of a bare os.listdir error
# later on, after the network weights have already been loaded.
if not os.path.isdir(images_dir):
    raise FileNotFoundError(
        "Image folder not found: " + images_dir +
        ". Check that Google Drive is mounted and adjust base_dir to the "
        "folder that contains 'Images_Intact_Damaged_Inserts'.")

# AUXILIARY FUNCTIONS

### Function to check if a directory exists

In [None]:
def check_if_directory_exists(name_folder):
    """
    Make sure a directory is available, creating it when it is missing.

    INPUT:
        name_folder: path of the directory to look for
    OUTPUT:
        nothing is returned; a message is printed telling whether the path
        was already there or has just been created (intermediate folders
        are created as needed)

    @author: Eduardo Fidalgo (EFF)
    """
    import os

    # Guard clause: the path is already present, so only report it.
    if os.path.exists(name_folder):
        print(name_folder + " directory exists, no action performed")
        return

    print(name_folder + " directory does not exist, created")
    os.makedirs(name_folder)

# 1. EXTRACT FEATURES

Our dataset has the same name as the folder that contains it: Images_Intact_Damaged_Inserts.

Check again https://keras.io/api/applications/ to look for the name of the model and weights.

Set a test set of 30% of the images (0.3). The rest (70%) will be used for training.

Set the size of the inserts, 224x224 pixels.

In the same webpage, check how to expand the dimensions, preprocess the input and compute the predictions of the model.

In [None]:
# To work with multiple datasets. Introduce the name of your dataset(s)
datasets_available = ["Images_Intact_Damaged_Inserts"]

# datasets to be analyzed
dataset = datasets_available

# For every dataset: run each image through VGG16, keep the output of the
# 'fc1' layer as its feature vector, and store the features, the encoded
# labels and the model under output/<dataset>/<model_name>/.
for ds in range(0, len(dataset)):
    # Configuration variables
    model_name    = "vgg16"
    weights       = "imagenet"
    # NOTE: include_top is not passed to VGG16 below. The network is built
    # with its default top because the 'fc1' layer used for the features
    # belongs to it. The variable is only kept for reference.
    include_top   = 0
    train_path    = os.path.join(base_dir, dataset[ds])
    output_dir    = "output/" + dataset[ds] + "/" + model_name
    features_path = output_dir + "/features.h5"
    labels_path   = output_dir + "/labels.h5"
    results       = output_dir + "/results" + dataset[ds] + ".txt"
    model_path    = output_dir + "/model"
    # Set a test set of 30% of the images (0.3). The rest (70%) will be used for training.
    # Here the value is only used to name the saved model files.
    test_size = 0.3

    # check if the output directories exist, if not, create them.
    check_if_directory_exists("output")
    check_if_directory_exists("output/" + dataset[ds])

    # start time
    print ("[STATUS] start time - {}".format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
    start = time.time()

    # Create the pretrained model
    if model_name == "vgg16":
        base_model = VGG16(weights=weights)
        # Feature extractor: same input as VGG16, output taken at 'fc1'.
        model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)
        # Set the size of the inserts, 224x224 pixels.
        image_size = (224, 224)
    else:
        # Without this, `model` and `image_size` would be undefined and the
        # cell would fail later with an unrelated NameError.
        raise ValueError("Unsupported model_name: " + model_name)

    # check if the model output directory exists, if not, create it.
    check_if_directory_exists(output_dir)

    print ("[INFO] successfully loaded base model and model...")

    # One sub-folder per class. Plain files and hidden folders (for example
    # .ipynb_checkpoints) are skipped, and the names are sorted so that the
    # order of the samples is the same on every run.
    train_labels = sorted(
        entry for entry in os.listdir(train_path)
        if not entry.startswith(".")
        and os.path.isdir(os.path.join(train_path, entry)))
    if not train_labels:
        raise ValueError("No class folders found in " + train_path)

    # variables to hold features and labels
    features = []
    labels   = []

    # loop over all the labels in the folder
    for i, label in enumerate(train_labels):
        cur_path = train_path + "/" + label
        # regular, non-hidden files of the class folder, in a stable order
        list_files = sorted(
            name for name in os.listdir(cur_path)
            if not name.startswith(".")
            and os.path.isfile(os.path.join(cur_path, name)))
        for count, file_name in enumerate(list_files, start=1):
            print ("[INFO] Processing - " + str(count) +
                   " named " + file_name)
            img = image.load_img(cur_path + "/" + file_name,
                                 target_size=image_size)
            x = image.img_to_array(img)
            # expand the dimensions (add the batch axis)
            x = np.expand_dims(x, axis=0)
            # preprocess the input
            x = preprocess_input(x)
            # compute the prediction of the model
            feature = model.predict(x)
            flat = feature.flatten()
            features.append(flat)
            labels.append(label)
            print ("[INFO] processed for image - " + file_name)
        print ("[INFO] completed label - " + label)

    if not features:
        raise ValueError("No images found under " + train_path)

    # encode the labels using LabelEncoder
    print ("[INFO] encoding labels...")
    le = LabelEncoder()
    le_labels = le.fit_transform(labels)

    # get the shape of training labels
    print ("[STATUS] training labels: {}".format(le_labels))
    print ("[STATUS] training labels shape: {}".format(le_labels.shape))

    # save features and labels; the context managers close the files even if
    # writing fails, and any error opening them is reported instead of hidden
    with h5py.File(features_path, 'w') as h5f_data:
        h5f_data.create_dataset('dataset_1', data=np.array(features))

    with h5py.File(labels_path, 'w') as h5f_label:
        h5f_label.create_dataset('dataset_1', data=np.array(le_labels))
    print ("[STATUS] features and labels saved..")

    # save model and weights
    model_json = model.to_json()
    with open(model_path + str(test_size) + ".json", "w") as json_file:
        json_file.write(model_json)

    # save weights
    weights_file = model_path + str(test_size) + ".h5"
    try:
        model.save_weights(weights_file)
    except ValueError:
        # Keras 3 only accepts weight files whose name ends in ".weights.h5";
        # fall back to that name when the historical one is rejected.
        weights_file = model_path + str(test_size) + ".weights.h5"
        model.save_weights(weights_file)
    print("[STATUS] saved model and weights to disk..")

    # end time
    end = time.time()
    print ("[STATUS] end time - {}".
           format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M")))
    print ("[STATUS] elapsed time - {:.1f} s".format(end - start))

# 2. TESTING PHASE

Set a test set of 30% of the images (0.3). The rest (70%) will be used for training.

Evaluate the model on the test data

Compute the accuracy and the recall using scikit-learn (https://scikit-learn.org/stable/modules/model_evaluation.html)

In [None]:
# Testing set. Set a test set of 30% of the images (0.3). The rest (70%) will be used for training.
test_set = [0.30]

# For every dataset: load the stored features and labels, train and evaluate
# a linear SVM on 5 different random train/test splits, and write the averaged
# accuracy and recall plus the per-split classification reports to a text file.
for ds in range(0, len(dataset)):
    # Run 5 times the experimentation and compute the average
    avg_accuracy = []
    avg_recall = []
    avg_results = []

    # config variables
    model_name    = "vgg16"
    features_path = "output/" + dataset[ds] + "/" + model_name + "/features.h5"
    labels_path   = "output/" + dataset[ds] + "/" + model_name + "/labels.h5"
    # test_set holds one fraction per dataset; when it is shorter than the
    # list of datasets, the last fraction is reused instead of failing.
    test_size     = test_set[min(ds, len(test_set) - 1)]
    results       = "output/" + dataset[ds] + "/" + model_name + "/results" + dataset[ds] + ".txt"
    # Not used in this cell.
    model_path    = "output/" + dataset[ds] + "/" + model_name + "/model"
    classifier_path = "output/" + dataset[ds] + "/" + model_name + "/classifier.pickle"

    # import features and labels once per dataset (they do not change
    # between experiments)
    with h5py.File(features_path, 'r') as h5f_data:
        features = np.array(h5f_data['dataset_1'])
    with h5py.File(labels_path, 'r') as h5f_label:
        labels = np.array(h5f_label['dataset_1'])

    # verify the shape of features and labels
    print ("[INFO] features shape: {}".format(features.shape))
    print ("[INFO] labels shape: {}".format(labels.shape))

    for exp in range(1, 6):
        print("Running experiment " + str(exp) + " for dataset " + dataset[ds])

        print ("[INFO] training started...")
        # split the training and testing data; the experiment number is the
        # seed, so every experiment uses a different but repeatable split
        (trainData, testData, trainLabels, testLabels) = train_test_split(
                features, labels,
                test_size=test_size, random_state=exp)

        print ("[INFO] splitted train and test data...")
        print ("[INFO] train data  : {}".format(trainData.shape))
        print ("[INFO] test data   : {}".format(testData.shape))
        print ("[INFO] train labels: {}".format(trainLabels.shape))
        print ("[INFO] test labels : {}".format(testLabels.shape))

        # Use SVM as the model
        print ("[INFO] creating model...")
        # random_state makes the probability estimates repeatable as well
        model = svm.SVC(kernel='linear',
                        probability=True, class_weight='balanced',
                        random_state=exp)
        model.fit(trainData, trainLabels)

        # use rank-1 and rank-5 predictions
        print ("[INFO] evaluating model...")
        # class probabilities of every test sample; columns follow model.classes_
        probabilities = model.predict_proba(testData)
        # up to five most probable classes per sample, most probable first;
        # column indices are mapped back to labels through model.classes_
        top_classes = model.classes_[
            np.argsort(probabilities, axis=1)[:, ::-1][:, :5]]

        # rank accuracies as percentages
        rank_1 = np.mean(top_classes[:, 0] == testLabels) * 100
        rank_5 = np.mean((top_classes == testLabels[:, None]).any(axis=1)) * 100
        print ("[INFO] rank-1: {:.2f}%  rank-5: {:.2f}%".format(rank_1, rank_5))

        # evaluate the model on the test data
        preds = model.predict(testData)

        # compute the accuracy (https://scikit-learn.org/stable/modules/model_evaluation.html)
        accuracy = sklearn.metrics.accuracy_score(testLabels, preds)
        accuracy = accuracy * 100
        avg_accuracy.append(accuracy)

        # compute the recall (https://scikit-learn.org/stable/modules/model_evaluation.html)
        recall = sklearn.metrics.recall_score(testLabels, preds,
                                              average='macro')
        recall = recall * 100
        avg_recall.append(recall)

        results_per_class = classification_report(testLabels, preds, digits=4)
        avg_results.append(results_per_class)

    # summary statistics over the experiments
    avg_acc = st.mean(avg_accuracy)
    std_acc = st.stdev(avg_accuracy)
    avg_rec = st.mean(avg_recall)
    std_rec = st.stdev(avg_recall)

    # write the summary and the classification report of every experiment;
    # the context manager closes the file even if a write fails
    with open(results, "w") as f:
        f.write("Averaged Accuracy: {:.2f}%\n\n".format(avg_acc))
        f.write("Std deviation (Accuracy): {:.2f}%\n\n".format(std_acc))
        f.write("Averaged Recall: {:.2f}%\n\n".format(avg_rec))
        f.write("Std deviation (Recall): {:.2f}%\n\n".format(std_rec))
        f.write("\n\n".join(avg_results))
