{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "CoAmo2D.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "P5bly-NzWICq" }, "source": [ "# **Amorphization prediction by using 2D molecule representations**\n", "\n", "This notebook implements a Keras-based prediction of amorphization experiments." ] }, { "cell_type": "markdown", "metadata": { "id": "RiwAhNr2Wfsa" }, "source": [ "Load all necessary packages" ] }, { "cell_type": "code", "metadata": { "id": "clw-RenvSvqo" }, "source": [ "%tensorflow_version 2.x\n", "import tensorflow as tf\n", "from tensorflow.keras import datasets, layers, models\n", "from keras.preprocessing import image\n", "from keras.preprocessing.image import load_img, ImageDataGenerator\n", "import warnings\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from PIL import Image\n", "import pandas as pd\n", "from sklearn.utils import shuffle" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Dhfuw0FnW9KS" }, "source": [ "Connect to Google Drive" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DzNOG_RSlCA4", "outputId": "823fd347-e988-4a00-b924-e58ab1eee354" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "lxtOlZQaW_1i" }, "source": [ "Name all experimentally used molecules and load their 2D pictures (RGB, 500x500 pixels)." ] }, { "cell_type": "code", "metadata": { "id": "48TwMD5ikPJ_" }, "source": [ "molecules = 
['2-Nitrobenzamide.png','2-Nitrobenzoicacid.png',\"3,3'-Thiodipropionicacid.png\",'3-Fluorobenzamide.png',\n", " '3-Methoxybenzoicacid.png','4-Aminobenzoicacid.png','4-Hydroxybenzamide.png','4-Methylbenzamide.png','4-Nitrobenzamide.png',\n", " 'Trans-Aconiticacid.png','Anthranilicacid.png','L-Ascorbicacid.png','Acetylsalicylicacid.png','Benzoicacid.png',\"4,4'-Bipyridine.png\",\n", " 'Caffeine.png','Carbamazepine.png','Citricacid.png','Diclofenac.png','Folicacid.png','L-Glutamicacid.png',\n", " 'Ibuprofene.png','L-Isoleucine.png','Isonicotinamide.png','L-Leucine.png','Mesaconicacid.png',\n", " 'Nicotinamide.png','Phenazine.png','Riboflavine.png','Salicylicacid.png','Theobromine.png','Theophylline.png','3-Methylbenzamide.png']\n", "\n", "# Folder on the mounted Drive that holds one PNG per molecule.\n", "MOLECULE_DIR = 'drive/MyDrive/Colab Notebooks/Molecules/'\n", "\n", "# im[k] is the picture belonging to molecules[k], downscaled to 100x100.\n", "im = []\n", "for fname in molecules:\n", "    # A descriptive loop name is used on purpose: the builtin `dir` must not be shadowed.\n", "    im.append(load_img(MOLECULE_DIR + fname).resize((100, 100)))" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "dwelgCNGXXkb" }, "source": [ "\n", "Let's look at some images and check if everything is in place. By changing `IMG_INDEX` we can look at the different molecules." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 285 }, "id": "IdUZEM1Poozj", "outputId": "69ec794d-246e-41d0-c5c1-945694ec7ed7" }, "source": [ "IMG_INDEX = 20\n", "print(molecules[IMG_INDEX])\n", "plt.imshow(im[IMG_INDEX] ,cmap=plt.cm.binary)\n", "plt.show()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "L-Glutamicacid.png\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "izshiWOwXlVT" }, "source": [ "Load the result matrix" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LVNWVTbgqPrR", "outputId": "253ec922-e6c1-4ac5-9c3f-70acea877b88" }, "source": [ "results = pd.read_csv('drive/MyDrive/Colab Notebooks/Molecules/Results.csv')\n", "print(results)\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " Component1 Component2 Outcome\n", "0 3,3'-Thiodipropionicacid 4,4'-Bipyridine 1\n", "1 3,3'-Thiodipropionicacid Acetylsalicylicacid 0\n", "2 3,3'-Thiodipropionicacid Anthranilicacid 0\n", "3 3,3'-Thiodipropionicacid Benzoicacid 0\n", "4 3,3'-Thiodipropionicacid Caffeine 0\n", ".. ... ... ...\n", "433 Folicacid 3-Methoxybenzoicacid 0\n", "434 Folicacid 2-Nitrobenzamide 1\n", "435 Folicacid 4-Nitrobenzamide 1\n", "436 Folicacid 3-Methylbenzamide 0\n", "437 Folicacid 4-Methylbenzamide 0\n", "\n", "[438 rows x 3 columns]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "NOhpwNlfXpIi" }, "source": [ "Define the function that combines both molecules horizontally." ] }, { "cell_type": "code", "metadata": { "id": "XiY2VMeL7Cuq" }, "source": [ "def get_concat_h(im1, im2):\n", "    \"\"\"Return a new RGB image with `im1` on the left and `im2` on the right.\n", "\n", "    Both images are pasted top-aligned. The canvas is as wide as both inputs\n", "    together and as tall as the taller one, so neither image is cropped.\n", "    For inputs of equal height the result is simply the two pictures side by side.\n", "    \"\"\"\n", "    dst = Image.new('RGB', (im1.width + im2.width, max(im1.height, im2.height)))\n", "    dst.paste(im1, (0, 0))\n", "    dst.paste(im2, (im1.width, 0))\n", "    return dst" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "W6qi6ToCXzv6" }, "source": [ "Execute the function on the result matrix." 
] }, { "cell_type": "code", "metadata": { "id": "fcpcZIMttJ70" }, "source": [ "# Seeded generator so the random left/right placement is the same on every run.\n", "order_rng = np.random.RandomState(0)\n", "\n", "CC = []\n", "for i in range(len(results)):\n", "    # Columns 0 and 1 hold the two component names; use explicit positional access.\n", "    A = im[molecules.index(results.iloc[i, 0] + '.png')]\n", "    B = im[molecules.index(results.iloc[i, 1] + '.png')]\n", "    # Randomise which molecule ends up on the left half of the pair image.\n", "    C = shuffle([A, B], random_state=order_rng)\n", "    AB = get_concat_h(C[0], C[1])\n", "    # Scale the 8-bit pixel values to [0, 1].\n", "    pix_arr = np.array(AB) / 255.0\n", "    CC.append(pix_arr)\n", "CC = tf.stack(CC)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "JyuYmcsht_11" }, "source": [ "Let's look at some images and check if everything is in place. By changing `IMG_INDEX` we can look at the different molecule pairs." ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 235 }, "id": "hXb6a7TJt_BY", "outputId": "d945cf52-3105-4f7f-db54-b502264a3d6f" }, "source": [ "IMG_INDEX = 20\n", "print(results.iloc[IMG_INDEX, 0]+' and '+results.iloc[IMG_INDEX, 1])\n", "plt.imshow(CC[IMG_INDEX] ,cmap=plt.cm.binary)\n", "plt.show()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "3,3'-Thiodipropionicacid and 3-Methoxybenzoicacid\n" ], "name": "stdout" }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "markdown", "metadata": { "id": "NZi5zGpo9iAo" }, "source": [ "Define classes and the labels of the results" ] }, { "cell_type": "code", "metadata": { "id": "ZJry3pzh9lsP" }, "source": [ "# classes[i] names the class with integer label i, matching the 0/1 values in 'Outcome'.\n", "classes = ['0','1']\n", "CC_labels = results['Outcome']\n", "CC_labels = tf.stack(CC_labels)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "KFkPx9E79xNQ" }, "source": [ "Building the **convolutional base**" ] }, { "cell_type": "code", "metadata": { "id": "oqZw50ODATHy" }, "source": [ "model = models.Sequential()\n", "model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(100,200, 3)))\n", "model.add(layers.MaxPooling2D((2, 2)))\n", "model.add(layers.Conv2D(64, (3, 3), activation='relu'))\n", "model.add(layers.MaxPooling2D((2, 2)))\n", "model.add(layers.Conv2D(64, (3, 3), activation='relu'))" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "bvLKK6f3AoHR" }, "source": [ "Let's have a look at the model so far\n" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iW4gTEj0Ae7J", "outputId": "841bb40c-dbc1-416f-e0f7-cdfdc9850875" }, "source": [ "model.summary()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "conv2d (Conv2D) (None, 98, 198, 32) 896 \n", "_________________________________________________________________\n", "max_pooling2d (MaxPooling2D) (None, 49, 99, 32) 0 \n", "_________________________________________________________________\n", "conv2d_1 (Conv2D) (None, 47, 97, 64) 18496 \n", "_________________________________________________________________\n", "max_pooling2d_1 (MaxPooling2 (None, 23, 48, 64) 0 
\n", "_________________________________________________________________\n", "conv2d_2 (Conv2D) (None, 21, 46, 64) 36928 \n", "=================================================================\n", "Total params: 56,320\n", "Trainable params: 56,320\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "wFBqfYXoA9nC" }, "source": [ "Adding **dense layer**" ] }, { "cell_type": "code", "metadata": { "id": "tujNlpkrAnCL" }, "source": [ "model.add(layers.Flatten())\n", "model.add(layers.Dense(64, activation='relu'))\n", "model.add(layers.Dense(2))" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "jCHyXxcyBJni" }, "source": [ "Training" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dcpeTbXQBL1K", "outputId": "ae813c52-e955-41e6-e8da-4c81b4aa7ce4" }, "source": [ "model.compile(optimizer='adam',\n", " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", " metrics=['accuracy'])\n", "\n", "history = model.fit(CC, CC_labels, epochs=1)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "14/14 [==============================] - 10s 647ms/step - loss: 1.1864 - accuracy: 0.5525\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "EsygqiNSuBti" }, "source": [ "Augmentation" ] }, { "cell_type": "code", "metadata": { "id": "Jcbj9DesfsJP" }, "source": [ "datagen = ImageDataGenerator(\n", "    rotation_range=40,\n", "    width_shift_range=0.2,\n", "    height_shift_range=0.2,\n", "    shear_range=0.2,\n", "    zoom_range=0.2,\n", "    horizontal_flip=True,\n", "    fill_mode='nearest')\n", "\n", "CC_Aug = []\n", "CC_Aug_labels = []\n", "\n", "for k in range(len(results)):\n", "    img = image.img_to_array(CC[k])\n", "    # flow() expects a batch axis in front: (1, height, width, channels).\n", "    img = img.reshape((1,) + img.shape)\n", "    i = 0\n", "    for batch in datagen.flow(img):\n", "        # Keep the augmented sample yielded by the generator, not the untouched input `img`.\n", "        CC_Aug.append(tf.reshape(batch, CC[0].shape))\n", "        CC_Aug_labels.append(CC_labels[k])\n", "        i += 1\n", "        # flow() yields indefinitely; stop after five augmented samples per pair.\n", "        if i > 4:\n", "            break" ], "execution_count": null, "outputs": [] },
{ "cell_type": "code", "metadata": { "id": "v5AQnqZikyI3" }, "source": [ "CC_new = tf.concat([CC,CC_Aug],0)\n", "CC_labels_new = tf.concat([CC_labels,CC_Aug_labels],-1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ARKA6DOlrspF" }, "source": [ "# Draw ONE permutation and apply it to images and labels alike, so every image\n", "# keeps its own label. Shuffling the two tensors with separate random ops does\n", "# not guarantee that both end up in the same order.\n", "perm = tf.random.shuffle(tf.range(tf.shape(CC_new)[0]), seed=1)\n", "CC_new = tf.gather(CC_new, perm)\n", "CC_labels_new = tf.gather(CC_labels_new, perm)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "K33BJzn1jKDu" }, "source": [ "model = models.Sequential()\n", "model.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(100,200, 3)))\n", "model.add(layers.MaxPooling2D((2, 2)))\n", "model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", "model.add(layers.MaxPooling2D((2, 2)))\n", "model.add(layers.Conv2D(128, (3, 3), activation='relu'))\n", "model.add(layers.Flatten())\n", "model.add(layers.Dense(256, activation='relu'))\n", "model.add(layers.Dense(2))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0fg-qUbaseEl", "outputId": "c01e761a-4977-4ed3-c70e-645be8300d45" }, "source": [ "model.compile(optimizer='adam',\n", " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", " metrics=['accuracy'])\n", "\n", "history = model.fit(CC_new, CC_labels_new, epochs=5)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Epoch 1/5\n", "83/83 [==============================] - 151s 2s/step - loss: 0.7944 - accuracy: 0.5845\n", "Epoch 2/5\n", "83/83 [==============================] - 150s 2s/step - loss: 0.6754 - accuracy: 0.6027\n", "Epoch 3/5\n", "83/83 [==============================] - 153s 2s/step - loss: 0.6782 - accuracy: 0.5982\n", "Epoch 
4/5\n", "83/83 [==============================] - 154s 2s/step - loss: 0.6776 - accuracy: 0.6027\n", "Epoch 5/5\n", "83/83 [==============================] - 154s 2s/step - loss: 0.6748 - accuracy: 0.6012\n" ], "name": "stdout" } ] } ] }