{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": { "name": "Untitled1.ipynb", "provenance": [] },
    "kernelspec": { "name": "python3", "display_name": "Python 3" },
    "language_info": { "name": "python" }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": { "id": "M1paAz06QjM1" },
      "source": [
        "import math\n",
        "from sklearn.utils import shuffle\n",
        "import matplotlib.pyplot as plt\n",
        "from mpl_toolkits.mplot3d import Axes3D\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from sklearn.neural_network import MLPClassifier\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.metrics import roc_auc_score\n",
        "from sklearn.decomposition import PCA\n",
        "from sklearn.model_selection import cross_val_score\n",
        "from hyperopt import hp, tpe, fmin, Trials\n",
        "import xgboost as xgb\n",
        "\n",
        "\n",
        "# 1. Load external data\n",
        "# 1.1. Raw molecular descriptor data of all components\n",
        "descriptors = pd.read_csv('Chem.csv', index_col=0)\n",
        "# 1.2. Co-crystal combinations and characterisation results\n",
        "results = pd.read_csv('Results03_.csv')\n",
        "\n",
        "# 2. Descriptor data cleaning\n",
        "# 2.1. Create the working data frame and drop columns with missing values\n",
        "descriptor_set = descriptors.dropna(axis=1)\n",
        "# 2.2. Drop columns with only a single unique value over all rows\n",
        "non_unique = descriptor_set.apply(pd.Series.nunique)\n",
        "cols_to_drop = non_unique[non_unique == 1].index\n",
        "descriptor_set = descriptor_set.drop(cols_to_drop, axis=1)\n",
        "# 2.3. Optionally export the cleaned descriptor set\n",
        "# descriptor_set.to_csv('descriptor_set.csv')\n",
        "\n",
        "# 3. Create the final experimental matrix containing descriptors and results\n",
        "# 3.1. Column labels CC_5 for the final matrix\n",
        "CC_1 = descriptor_set.columns + '_1'\n",
        "CC_2 = descriptor_set.columns + '_2'\n",
        "CC_3 = ['Result']\n",
        "CC_4 = CC_1.union(CC_2, sort=False)\n",
        "CC_5 = CC_4.union(CC_3, sort=False)\n",
        "\n",
        "# 3.2. Empty list that will hold one row per experiment\n",
        "CC_e = []\n",
        "np.random.seed(1)\n",
        "# 3.3. Fill the list with component 1 and component 2 descriptors plus the result (per experiment);\n",
        "#      the two components are shuffled so the model cannot learn an artificial ordering\n",
        "for i in range(len(results)):\n",
        "    A = np.array(descriptor_set.loc[results.iloc[i, 0]])\n",
        "    B = np.array(descriptor_set.loc[results.iloc[i, 1]])\n",
        "    RS = [A, B]\n",
        "    C = np.concatenate(shuffle(RS), axis=0)\n",
        "    D = [results.iloc[i, -1]]\n",
        "    E = np.concatenate((C, D), axis=0)\n",
        "    CC_e.append(E)\n",
        "# 3.4. Transform the list into a data frame to obtain the final matrix\n",
        "CC = pd.DataFrame(CC_e, columns=CC_5)\n",
        "\n",
        "# 4. Normalize the data and apply PCA (as well as t-SNE)\n",
        "# 4.1. Normalize the descriptors and split them back into the two component blocks\n",
        "x = CC.loc[:, CC_4].values\n",
        "x = StandardScaler().fit_transform(x)\n",
        "sep = x.shape[1] // 2\n",
        "x1 = x[:, :sep]\n",
        "x2 = x[:, sep:]\n",
        "\n",
        "# 4.2. Normalized data has a mean of 0 and a standard deviation of 1\n",
        "# print('The mean of the normalized data is {} and the standard deviation is {}.'.format(np.mean(x), np.std(x)))\n",
        "\n",
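        "# Optional sketch (not part of the original pipeline): the section 4 header mentions t-SNE,\n",
        "# but no t-SNE code follows. A minimal 2-D embedding of the standardized component 1 descriptor\n",
        "# block could look like this; the perplexity value and the colouring by the 'Result' column are\n",
        "# assumptions made purely for illustration.\n",
        "from sklearn.manifold import TSNE\n",
        "CC_tsne1 = TSNE(n_components=2, perplexity=30, random_state=1).fit_transform(x1)\n",
        "plt.scatter(CC_tsne1[:, 0], CC_tsne1[:, 1], c=CC['Result'].astype(float), cmap='coolwarm', s=15)\n",
        "plt.xlabel('t-SNE dimension 1')\n",
        "plt.ylabel('t-SNE dimension 2')\n",
        "plt.title('t-SNE embedding of the component 1 descriptors')\n",
        "plt.show()\n",
        "\n",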
        "# 4.4. Apply PCA to the component 1 descriptor block\n",
        "pca_components = 9\n",
        "CC_pca1 = PCA(n_components=pca_components)\n",
        "CC_princomp1 = CC_pca1.fit_transform(x1)\n",
        "CC_principal_df1 = pd.DataFrame(data=CC_princomp1, columns=['PC_1_' + str(i + 1) for i in range(pca_components)])\n",
        "# 4.5. Variance lost by the dimensionality reduction\n",
        "# print('Explained variation per principal component: {}'.format(CC_pca1.explained_variance_ratio_))\n",
        "Lost1 = 100 * (1 - sum(CC_pca1.explained_variance_ratio_))\n",
        "nb_des1 = x1.shape[1]\n",
        "nb_com1 = len(CC_pca1.explained_variance_ratio_)\n",
        "print('{:.1f}% of the variance is lost by reducing {} features to {} principal components.'.format(Lost1, nb_des1, nb_com1))\n",
        "expl = CC_pca1.explained_variance_ratio_\n",
        "# print(expl)\n",
        "loadings = pd.DataFrame(\n",
        "    data=CC_pca1.components_.T * np.sqrt(CC_pca1.explained_variance_),\n",
        "    index=descriptor_set.columns,\n",
        "    columns=CC_principal_df1.columns\n",
        ")\n",
        "loadings.to_csv('loadings.csv')\n",
        "\n",
        "# 4.6. Apply PCA to the component 2 descriptor block\n",
        "CC_pca2 = PCA(n_components=pca_components)\n",
        "CC_princomp2 = CC_pca2.fit_transform(x2)\n",
        "CC_principal_df2 = pd.DataFrame(data=CC_princomp2, columns=['PC_2_' + str(i + 1) for i in range(pca_components)])\n",
        "# 4.7. Variance lost by the dimensionality reduction\n",
        "# print('Explained variation per principal component: {}'.format(CC_pca2.explained_variance_ratio_))\n",
        "Lost2 = 100 * (1 - sum(CC_pca2.explained_variance_ratio_))\n",
        "nb_des2 = x2.shape[1]\n",
        "nb_com2 = len(CC_pca2.explained_variance_ratio_)\n",
        "print('{:.1f}% of the variance is lost by reducing {} features to {} principal components.'.format(Lost2, nb_des2, nb_com2))\n",
        "\n",
        "# 5. Set up the machine learning data frame, randomize it and separate training and test data\n",
        "# 5.1. Create the final data frame\n",
        "CC_principal_together = pd.concat([CC_principal_df1, CC_principal_df2], axis=1)\n",
        "CC_df = pd.concat([CC_principal_together, CC.iloc[:, -1]], axis=1)\n",
        "\n",
        "# 5.2. Start randomizing\n",
        "# 5.2.1. Set the random seed so that the shuffle is the same every time\n",
        "np.random.seed(1)\n",
        "# 5.2.2. Put rows into random order: permute the index randomly and reindex the data frame with the result\n",
        "CC_df = CC_df.reindex(np.random.permutation(CC_df.index))\n",
        "\n",
        "# 6. Hyperparameter optimisation of an XGBoost classifier with hyperopt\n",
        "list_mean = list()\n",
        "list_std = list()\n",
        "\n",
        "# 6.1. Objective function (fmin minimises, so 1 - AUC is returned)\n",
        "def objective(params, n_folds=5):\n",
        "    # Convert the pandas data frame into the xgboost DMatrix format\n",
        "    d_train = xgb.DMatrix(CC_df.iloc[:, :-1], CC_df.iloc[:, -1])\n",
        "\n",
        "    # Run cross-validation on the xgboost model\n",
        "    cv_results = xgb.cv(params, d_train, nfold=n_folds, num_boost_round=500,\n",
        "                        early_stopping_rounds=25, metrics='auc', seed=0)\n",
        "    # Record the mean AUC of the best boosting round and its standard deviation across folds\n",
        "    best_round = cv_results['test-auc-mean'].idxmax()\n",
        "    auc = cv_results.loc[best_round, 'test-auc-mean']\n",
        "    list_mean.append(auc)\n",
        "    list_std.append(cv_results.loc[best_round, 'test-auc-std'])\n",
        "    # Return the loss on the validation set\n",
        "    return 1 - auc\n",
        "\n",
        "\n",
        "# 6.2. Search space\n",
        "xgb_space = {\n",
        "    # Binary classification objective; AUC is evaluated via metrics='auc' in xgb.cv\n",
        "    'objective': 'binary:logistic',\n",
        "    # Suppress per-round info and warning output\n",
        "    'verbosity': 0,\n",
        "    # max_depth: maximum depth allowed for every tree\n",
        "    'max_depth': hp.choice('max_depth', np.arange(1, 20, 1, dtype=int)),\n",
        "    # subsample: fraction of rows sampled for every tree\n",
        "    'subsample': hp.quniform('subsample', 0.2, 1.0, 0.05),\n",
        "    # colsample_bytree: fraction of features sampled for every tree\n",
        "    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1.0, 0.05),\n",
        "    # min_child_weight: minimum sum of instance weight required in each node\n",
        "    'min_child_weight': hp.uniform('min_child_weight', 1, 20),\n",
        "    # reg_alpha: L1 regularisation term on weights\n",
        "    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),\n",
        "    # reg_lambda: L2 regularisation term on weights\n",
        "    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0)\n",
        "    # Note: the number of trees is governed by num_boost_round and early_stopping_rounds\n",
        "    # inside xgb.cv, so 'n_estimators' (a parameter of the sklearn wrapper only) is not tuned here\n",
        "}\n",
        "\n",
        "# 6.3. Store the results of every iteration\n",
        "trials = Trials()\n",
        "MAX_EVALS = 100\n",
        "\n",
        "# 6.4. Optimize\n",
        "best = fmin(fn=objective, space=xgb_space, algo=tpe.suggest,\n",
        "            max_evals=MAX_EVALS, trials=trials)\n",
        "print(best)\n",
        "# Best mean cross-validation AUC and its standard deviation across folds\n",
        "best_auc = max(list_mean)\n",
        "print(best_auc, list_std[list_mean.index(best_auc)])\n",
        "\n",
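        "# Hedged sketch (not in the original notebook): for hp.choice parameters fmin returns the\n",
        "# chosen index rather than the value, so space_eval maps `best` back to actual hyperparameters\n",
        "# before fitting a final booster. The 80-row hold-out mirrors the MLP evaluation below;\n",
        "# num_boost_round=500 without early stopping is an assumption made for brevity.\n",
        "from hyperopt import space_eval\n",
        "best_params = space_eval(xgb_space, best)\n",
        "d_fit = xgb.DMatrix(CC_df.iloc[80:, :-1], CC_df.iloc[80:, -1])\n",
        "d_test = xgb.DMatrix(CC_df.iloc[0:80, :-1])\n",
        "final_booster = xgb.train(best_params, d_fit, num_boost_round=500)\n",
        "xgb_auc = roc_auc_score(CC_df.iloc[0:80, -1], final_booster.predict(d_test))\n",
        "print('Hold-out AUC of the tuned XGBoost model: {:.3f}'.format(xgb_auc))\n",
        "\n",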
        "# 7. MLP classifier baseline on the PCA features\n",
        "# 7.1. Hold out the first 80 rows as a test set and train on the remaining rows\n",
        "clf = MLPClassifier()\n",
        "clf.fit(CC_df.iloc[80:, 0:-1], CC_df.iloc[80:, -1])\n",
        "predictions = clf.predict(CC_df.iloc[0:80, 0:-1])\n",
        "auc1 = roc_auc_score(CC_df.iloc[0:80, -1], predictions)\n",
        "print(auc1)\n",
        "# 7.2. 5-fold cross-validation over the complete data set\n",
        "scores = cross_val_score(clf, CC_df.iloc[:, 0:-1], CC_df.iloc[:, -1], cv=5)\n",
        "print(scores, scores.mean())\n",
        "# importance = clf.feature_importances_  # not available: MLPClassifier has no feature_importances_\n",
        "# print(importance)\n",
        "\n",
        "# Last. Further model evaluation metrics (require accuracy_score, precision_score and\n",
        "# confusion_matrix from sklearn.metrics)\n",
        "# print('Accuracy Score: ' + str(accuracy_score(CC_df.iloc[0:80, -1], predictions)))\n",
        "# print('Precision Score: ' + str(precision_score(CC_df.iloc[0:80, -1], predictions)))\n",
        "\n",
        "# Last.1. Confusion matrix of the MLP classifier\n",
        "# print('Confusion Matrix: \\n' + str(confusion_matrix(CC_df.iloc[0:80, -1], predictions)))\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}