{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": { "name": "Untitled1.ipynb", "provenance": [] },
    "kernelspec": { "name": "python3", "display_name": "Python 3" },
    "language_info": { "name": "python" }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": { "id": "M1paAz06QjM1" },
      "source": [
        "import math\n",
        "from sklearn.utils import shuffle\n",
        "import matplotlib.pyplot as plt\n",
        "from mpl_toolkits.mplot3d import Axes3D\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from sklearn.neural_network import MLPClassifier\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.metrics import roc_auc_score\n",
        "from sklearn.decomposition import PCA\n",
        "from sklearn.model_selection import cross_val_score\n",
        "from hyperopt import hp, tpe, fmin, Trials\n",
        "import xgboost as xgb\n",
        "\n",
        "\n",
        "# 1. Load external data\n",
        "# 1.1. Raw molecular descriptor data of all components\n",
        "descriptors = pd.read_csv('Chem.csv', index_col=0)\n",
        "# 1.2. Co-crystal combinations and characterisation results\n",
        "results = pd.read_csv('Results03_.csv')\n",
        "\n",
        "# 2. Descriptor data cleaning\n",
        "# 2.1. Create the working data frame and drop columns with missing values\n",
        "descriptor_set = descriptors.dropna(axis=1)\n",
        "# 2.2. Drop columns with only a single unique value over all rows\n",
        "non_unique = descriptor_set.apply(pd.Series.nunique)\n",
        "cols_to_drop = non_unique[non_unique == 1].index\n",
        "descriptor_set = descriptor_set.drop(cols_to_drop, axis=1)\n",
        "# 2.3. Optionally export the cleaned descriptor set\n",
        "# descriptor_set.to_csv('descriptor_set.csv')\n",
        "\n",
        "# 3. Create the final experimental matrix containing descriptors and results\n",
        "# 3.1. Column labels CC_5 for the final matrix\n",
        "CC_1 = descriptor_set.columns + '_1'\n",
        "CC_2 = descriptor_set.columns + '_2'\n",
        "CC_3 = ['Result']\n",
        "CC_4 = CC_1.union(CC_2, sort=False)\n",
        "CC_5 = CC_4.union(CC_3, sort=False)\n",
        "\n",
        "# 3.2. Empty list that will hold one row per experiment\n",
        "CC_e = []\n",
        "np.random.seed(1)\n",
        "# 3.3. Fill the list with component 1 and component 2 descriptors plus the result (per experiment);\n",
        "#      the two components are shuffled so the model cannot learn an artificial ordering\n",
        "for i in range(len(results)):\n",
        "    A = np.array(descriptor_set.loc[results.iloc[i, 0]])\n",
        "    B = np.array(descriptor_set.loc[results.iloc[i, 1]])\n",
        "    RS = [A, B]\n",
        "    C = np.concatenate(shuffle(RS), axis=0)\n",
        "    D = [results.iloc[i, -1]]\n",
        "    E = np.concatenate((C, D), axis=0)\n",
        "    CC_e.append(E)\n",
        "# 3.4. Transform the list into a data frame to obtain the final matrix\n",
        "CC = pd.DataFrame(CC_e, columns=CC_5)\n",
        "\n",
        "# 4. Normalize the data and apply PCA (as well as t-SNE)\n",
        "# 4.1. Normalize the descriptors and split them back into the two component blocks\n",
        "x = CC.loc[:, CC_4].values\n",
        "x = StandardScaler().fit_transform(x)\n",
        "sep = x.shape[1] // 2\n",
        "x1 = x[:, :sep]\n",
        "x2 = x[:, sep:]\n",
        "\n",
        "# 4.2. Normalized data has a mean of 0 and a standard deviation of 1\n",
        "# print('The mean of the normalized data is {} and the standard deviation is {}.'.format(np.mean(x), np.std(x)))\n",
        "\n",
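        "# Optional sketch (not part of the original pipeline): the section 4 header mentions t-SNE,\n",
        "# but no t-SNE code follows. A minimal 2-D embedding of the standardized component 1 descriptor\n",
        "# block could look like this; the perplexity value and the colouring by the 'Result' column are\n",
        "# assumptions made purely for illustration.\n",
        "from sklearn.manifold import TSNE\n",
        "CC_tsne1 = TSNE(n_components=2, perplexity=30, random_state=1).fit_transform(x1)\n",
        "plt.scatter(CC_tsne1[:, 0], CC_tsne1[:, 1], c=CC['Result'].astype(float), cmap='coolwarm', s=15)\n",
        "plt.xlabel('t-SNE dimension 1')\n",
        "plt.ylabel('t-SNE dimension 2')\n",
        "plt.title('t-SNE embedding of the component 1 descriptors')\n",
        "plt.show()\n",
        "\n",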
        "# 4.4. Apply PCA to the component 1 descriptor block\n",
        "pca_components = 9\n",
        "CC_pca1 = PCA(n_components=pca_components)\n",
        "CC_princomp1 = CC_pca1.fit_transform(x1)\n",
        "CC_principal_df1 = pd.DataFrame(data=CC_princomp1, columns=['PC_1_' + str(i + 1) for i in range(pca_components)])\n",
        "# 4.5. Variance lost by the dimensionality reduction\n",
        "# print('Explained variation per principal component: {}'.format(CC_pca1.explained_variance_ratio_))\n",
        "Lost1 = 100 * (1 - sum(CC_pca1.explained_variance_ratio_))\n",
        "nb_des1 = x1.shape[1]\n",
        "nb_com1 = len(CC_pca1.explained_variance_ratio_)\n",
        "print('{:.1f}% of the variance is lost by reducing {} features to {} principal components.'.format(Lost1, nb_des1, nb_com1))\n",
        "expl = CC_pca1.explained_variance_ratio_\n",
        "# print(expl)\n",
        "loadings = pd.DataFrame(\n",
        "    data=CC_pca1.components_.T * np.sqrt(CC_pca1.explained_variance_),\n",
        "    index=descriptor_set.columns,\n",
        "    columns=CC_principal_df1.columns\n",
        ")\n",
        "loadings.to_csv('loadings.csv')\n",
        "\n",
        "# 4.6. Apply PCA to the component 2 descriptor block\n",
        "CC_pca2 = PCA(n_components=pca_components)\n",
        "CC_princomp2 = CC_pca2.fit_transform(x2)\n",
        "CC_principal_df2 = pd.DataFrame(data=CC_princomp2, columns=['PC_2_' + str(i + 1) for i in range(pca_components)])\n",
        "# 4.7. Variance lost by the dimensionality reduction\n",
        "# print('Explained variation per principal component: {}'.format(CC_pca2.explained_variance_ratio_))\n",
        "Lost2 = 100 * (1 - sum(CC_pca2.explained_variance_ratio_))\n",
        "nb_des2 = x2.shape[1]\n",
        "nb_com2 = len(CC_pca2.explained_variance_ratio_)\n",
        "print('{:.1f}% of the variance is lost by reducing {} features to {} principal components.'.format(Lost2, nb_des2, nb_com2))\n",
        "\n",
        "# 5. Set up the machine learning data frame, randomize it and separate training and test data\n",
        "# 5.1. Create the final data frame\n",
        "CC_principal_together = pd.concat([CC_principal_df1, CC_principal_df2], axis=1)\n",
        "CC_df = pd.concat([CC_principal_together, CC.iloc[:, -1]], axis=1)\n",
        "\n",
        "# 5.2. Start randomizing\n",
        "# 5.2.1. Set the random seed so that the shuffle is the same every time\n",
        "np.random.seed(1)\n",
        "# 5.2.2. Put rows into random order: permute the index randomly and reindex the data frame with the result\n",
        "CC_df = CC_df.reindex(np.random.permutation(CC_df.index))\n",
        "\n",
        "# 6. Hyperparameter optimisation of an XGBoost classifier with hyperopt\n",
        "list_mean = list()\n",
        "list_std = list()\n",
        "\n",
        "# 6.1. Objective function (fmin minimises, so 1 - AUC is returned)\n",
        "def objective(params, n_folds=5):\n",
        "    # Convert the pandas data frame into the xgboost DMatrix format\n",
        "    d_train = xgb.DMatrix(CC_df.iloc[:, :-1], CC_df.iloc[:, -1])\n",
        "\n",
        "    # Run cross-validation on the xgboost model\n",
        "    cv_results = xgb.cv(params, d_train, nfold=n_folds, num_boost_round=500,\n",
        "                        early_stopping_rounds=25, metrics='auc', seed=0)\n",
        "    # Record the mean AUC of the best boosting round and its standard deviation across folds\n",
        "    best_round = cv_results['test-auc-mean'].idxmax()\n",
        "    auc = cv_results.loc[best_round, 'test-auc-mean']\n",
        "    list_mean.append(auc)\n",
        "    list_std.append(cv_results.loc[best_round, 'test-auc-std'])\n",
        "    # Return the loss on the validation set\n",
        "    return 1 - auc\n",
        "\n",
        "\n",
        "# 6.2. Search space\n",
        "xgb_space = {\n",
        "    # Binary classification objective; AUC is evaluated via metrics='auc' in xgb.cv\n",
        "    'objective': 'binary:logistic',\n",
        "    # Suppress per-round info and warning output\n",
        "    'verbosity': 0,\n",
        "    # max_depth: maximum depth allowed for every tree\n",
        "    'max_depth': hp.choice('max_depth', np.arange(1, 20, 1, dtype=int)),\n",
        "    # subsample: fraction of rows sampled for every tree\n",
        "    'subsample': hp.quniform('subsample', 0.2, 1.0, 0.05),\n",
        "    # colsample_bytree: fraction of features sampled for every tree\n",
        "    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1.0, 0.05),\n",
        "    # min_child_weight: minimum sum of instance weight required in each node\n",
        "    'min_child_weight': hp.uniform('min_child_weight', 1, 20),\n",
        "    # reg_alpha: L1 regularisation term on weights\n",
        "    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),\n",
        "    # reg_lambda: L2 regularisation term on weights\n",
        "    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)\n",
        "    # Note: the number of trees is governed by num_boost_round and early_stopping_rounds\n",
        "    # inside xgb.cv, so 'n_estimators' (a parameter of the sklearn wrapper only) is not tuned here\n",
        "}\n",
        "\n",
        "# 6.3. Store the results of every iteration\n",
        "trials = Trials()\n",
        "MAX_EVALS = 100\n",
        "\n",
        "# 6.4. Optimize\n",
        "best = fmin(fn=objective, space=xgb_space, algo=tpe.suggest,\n",
        "            max_evals=MAX_EVALS, trials=trials)\n",
        "print(best)\n",
        "# Best mean cross-validation AUC and its standard deviation across folds\n",
        "best_auc = max(list_mean)\n",
        "print(best_auc, list_std[list_mean.index(best_auc)])\n",
        "\n",
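        "# Hedged sketch (not in the original notebook): for hp.choice parameters fmin returns the\n",
        "# chosen index rather than the value, so space_eval maps `best` back to actual hyperparameters\n",
        "# before fitting a final booster. The 80-row hold-out mirrors the MLP evaluation below;\n",
        "# num_boost_round=500 without early stopping is an assumption made for brevity.\n",
        "from hyperopt import space_eval\n",
        "best_params = space_eval(xgb_space, best)\n",
        "d_fit = xgb.DMatrix(CC_df.iloc[80:, :-1], CC_df.iloc[80:, -1])\n",
        "d_test = xgb.DMatrix(CC_df.iloc[0:80, :-1])\n",
        "final_booster = xgb.train(best_params, d_fit, num_boost_round=500)\n",
        "xgb_auc = roc_auc_score(CC_df.iloc[0:80, -1], final_booster.predict(d_test))\n",
        "print('Hold-out AUC of the tuned XGBoost model: {:.3f}'.format(xgb_auc))\n",
        "\n",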
        "# 7. MLP classifier baseline on the PCA features\n",
        "# 7.1. Hold out the first 80 rows as a test set and train on the remaining rows\n",
        "clf = MLPClassifier()\n",
        "clf.fit(CC_df.iloc[80:, 0:-1], CC_df.iloc[80:, -1])\n",
        "predictions = clf.predict(CC_df.iloc[0:80, 0:-1])\n",
        "auc1 = roc_auc_score(CC_df.iloc[0:80, -1], predictions)\n",
        "print(auc1)\n",
        "# 7.2. 5-fold cross-validation over the complete data set\n",
        "scores = cross_val_score(clf, CC_df.iloc[:, 0:-1], CC_df.iloc[:, -1], cv=5)\n",
        "print(scores, scores.mean())\n",
        "# importance = clf.feature_importances_  # not available: MLPClassifier has no feature_importances_\n",
        "# print(importance)\n",
        "\n",
        "# Last. Further model evaluation metrics (require accuracy_score, precision_score and\n",
        "# confusion_matrix from sklearn.metrics)\n",
        "# print('Accuracy Score: ' + str(accuracy_score(CC_df.iloc[0:80, -1], predictions)))\n",
        "# print('Precision Score: ' + str(precision_score(CC_df.iloc[0:80, -1], predictions)))\n",
        "\n",
        "# Last.1. Confusion matrix of the MLP classifier\n",
        "# print('Confusion Matrix: \\n' + str(confusion_matrix(CC_df.iloc[0:80, -1], predictions)))\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}