{ "cells": [ { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as p\n", "import matplotlib.pyplot as plt\n", "from sklearn import linear_model\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.neural_network import MLPRegressor" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Read the CSV into a DataFrame. Fields are split on ';' instead of the\n", "# pandas default ','. The path is relative, so it is resolved against the\n", "# kernel's current working directory.\n", "dataset = p.read_csv(\n", "    \"Downloads/mix.csv\",\n", "    sep=\";\",\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | NSC | \n", "smiles | \n", "pKa | \n", "H_ID | \n", "qH | \n", "qO | \n", "qOd | \n", "acid | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "NSC_10232 | \n", "Fc1c(cccc1)OCC(=O)O | \n", "3.08 | \n", "19 | \n", "0.489031 | \n", "-0.739889 | \n", "-0.532442 | \n", "True | \n", "
1 | \n", "NSC_102796 | \n", "c1cc(cc(c1OCCCC(=O)O)C)Cl | \n", "4.84 | \n", "28 | \n", "0.481782 | \n", "-0.737343 | \n", "-0.536813 | \n", "True | \n", "
2 | \n", "NSC_10312 | \n", "Fc1ccc(c(F)c1)C(=O)O | \n", "3.58 | \n", "15 | \n", "0.487173 | \n", "-0.758570 | \n", "-0.517106 | \n", "True | \n", "
3 | \n", "NSC_10319 | \n", "Fc1c(cccc1)C(=O)O | \n", "3.27 | \n", "15 | \n", "0.485349 | \n", "-0.757463 | \n", "-0.517331 | \n", "True | \n", "
4 | \n", "NSC_10320 | \n", "Fc1cc(ccc1)C(=O)O | \n", "3.86 | \n", "15 | \n", "0.486689 | \n", "-0.752247 | \n", "-0.512832 | \n", "True | \n", "
5 | \n", "NSC_10321 | \n", "Fc1ccc(cc1)C(=O)O | \n", "4.14 | \n", "15 | \n", "0.485683 | \n", "-0.754989 | \n", "-0.516351 | \n", "True | \n", "
6 | \n", "NSC_106449 | \n", "C(=O)(O)CCC(=O)O | \n", "4.21 | \n", "9 | \n", "0.481352 | \n", "-0.736415 | \n", "-0.535495 | \n", "True | \n", "
7 | \n", "NSC_1112 | \n", "C1C(C1)C(=O)O | \n", "4.83 | \n", "12 | \n", "0.480072 | \n", "-0.737349 | \n", "-0.530925 | \n", "True | \n", "
8 | \n", "NSC_11765 | \n", "C(CC)(CC)C(=O)O | \n", "4.71 | \n", "20 | \n", "0.479354 | \n", "-0.742006 | \n", "-0.532430 | \n", "True | \n", "
9 | \n", "NSC_120417 | \n", "c1ccccc1[C@@H](c1ccccc1)C(=O)O | \n", "3.94 | \n", "28 | \n", "0.481019 | \n", "-0.736494 | \n", "-0.518805 | \n", "True | \n", "
10 | \n", "NSC_12096 | \n", "c1(c(c(c(cc1C(=O)O)C(=O)O)C(=O)O)C(=O)O)C(=O)O | \n", "1.80 | \n", "27 | \n", "0.494718 | \n", "-0.752332 | \n", "-0.491932 | \n", "True | \n", "
11 | \n", "NSC_125718 | \n", "c1ccccc1CC(=O)O | \n", "4.31 | \n", "18 | \n", "0.480703 | \n", "-0.735679 | \n", "-0.531820 | \n", "True | \n", "
12 | \n", "NSC_126584 | \n", "Fc1cccc(F)c1C(=O)O | \n", "2.85 | \n", "15 | \n", "0.486151 | \n", "-0.719989 | \n", "-0.508648 | \n", "True | \n", "
13 | \n", "NSC_132953 | \n", "C(=O)(O)C | \n", "4.76 | \n", "5 | \n", "0.480616 | \n", "-0.725817 | \n", "-0.543885 | \n", "True | \n", "
14 | \n", "NSC_13564 | \n", "c1cc(cc(c1C(=O)O)O)O | \n", "3.11 | \n", "17 | \n", "0.482454 | \n", "-0.759200 | \n", "-0.519944 | \n", "True | \n", "
15 | \n", "NSC_141 | \n", "BrCC(=O)O | \n", "2.89 | \n", "8 | \n", "0.490306 | \n", "-0.729924 | \n", "-0.526733 | \n", "True | \n", "
16 | \n", "NSC_14190 | \n", "O1[C@@]23[C@@H]4CC[C@]5(C(=C)C[C@]4([C@H]([C@@... | \n", "4.00 | \n", "42 | \n", "0.482679 | \n", "-0.742933 | \n", "-0.524160 | \n", "True | \n", "
17 | \n", "NSC_142 | \n", "ClCC(=O)O | \n", "2.87 | \n", "8 | \n", "0.490754 | \n", "-0.728108 | \n", "-0.524686 | \n", "True | \n", "
18 | \n", "NSC_14285 | \n", "Clc1ccc(cc1)CC(=O)O | \n", "4.19 | \n", "18 | \n", "0.482455 | \n", "-0.734872 | \n", "-0.526215 | \n", "True | \n", "
19 | \n", "NSC_14358 | \n", "Brc1ccc(cc1)CC(=O)O | \n", "4.19 | \n", "18 | \n", "0.483267 | \n", "-0.735217 | \n", "-0.528036 | \n", "True | \n", "
20 | \n", "NSC_147400 | \n", "c1c(c(cc(c1C(=O)O)C)C)C | \n", "4.38 | \n", "21 | \n", "0.481206 | \n", "-0.754803 | \n", "-0.514740 | \n", "True | \n", "
21 | \n", "NSC_149 | \n", "c1c(cccc1)C(=O)O | \n", "4.19 | \n", "15 | \n", "0.483660 | \n", "-0.754053 | \n", "-0.516827 | \n", "True | \n", "
22 | \n", "NSC_15042 | \n", "c1(c(cccc1)C(=O)O)Cl | \n", "2.89 | \n", "15 | \n", "0.485712 | \n", "-0.756333 | \n", "-0.512407 | \n", "True | \n", "
23 | \n", "NSC_151909 | \n", "c1cccc2c1cc1ccccc1c2C(=O)O | \n", "3.65 | \n", "27 | \n", "0.480664 | \n", "-0.723346 | \n", "-0.504408 | \n", "True | \n", "
24 | \n", "NSC_15310 | \n", "c1c(cccc1C(=O)O)C(=O)O | \n", "3.70 | \n", "18 | \n", "0.486608 | \n", "-0.756723 | \n", "-0.514222 | \n", "True | \n", "
25 | \n", "NSC_15772 | \n", "c1cccc2c1c(ccc2)CC(=O)O | \n", "4.23 | \n", "24 | \n", "0.481026 | \n", "-0.737732 | \n", "-0.528844 | \n", "True | \n", "
26 | \n", "NSC_15797 | \n", "O=Cc1ccc(cc1)C(=O)O | \n", "3.77 | \n", "16 | \n", "0.487109 | \n", "-0.752144 | \n", "-0.509275 | \n", "True | \n", "
27 | \n", "NSC_16045 | \n", "C(C)C(C)(C)C(=O)O | \n", "5.03 | \n", "20 | \n", "0.479349 | \n", "-0.746686 | \n", "-0.539767 | \n", "True | \n", "
28 | \n", "NSC_166 | \n", "OCC(=O)O | \n", "3.83 | \n", "9 | \n", "0.486285 | \n", "-0.737352 | \n", "-0.541080 | \n", "True | \n", "
29 | \n", "NSC_16631 | \n", "c1(c(ccc(c1)C(=O)O)O)O | \n", "4.26 | \n", "15 | \n", "0.484390 | \n", "-0.755481 | \n", "-0.514857 | \n", "True | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
341 | \n", "NSC_8130 | \n", "c1(cc(cc(c1O)C)C)C(C)(C)C | \n", "12.04 | \n", "16 | \n", "0.463194 | \n", "-0.776039 | \n", "-0.487273 | \n", "False | \n", "
342 | \n", "NSC_8204 | \n", "c1cccc(c1C(=O)OC)O | \n", "9.87 | \n", "19 | \n", "0.463725 | \n", "-0.727542 | \n", "-0.425201 | \n", "False | \n", "
343 | \n", "NSC_82996 | \n", "O=Cc1ccc(c(c1)O)OC | \n", "8.89 | \n", "19 | \n", "0.460785 | \n", "-0.757707 | \n", "-0.469153 | \n", "False | \n", "
344 | \n", "NSC_8464 | \n", "Clc1cc(ccc1O)C(C)(C)C | \n", "8.58 | \n", "16 | \n", "0.462662 | \n", "-0.753492 | \n", "-0.452375 | \n", "False | \n", "
345 | \n", "NSC_8475 | \n", "c1cc(ccc1C(=O)OCCCC)O | \n", "8.47 | \n", "19 | \n", "0.463051 | \n", "-0.762809 | \n", "-0.444693 | \n", "False | \n", "
346 | \n", "NSC_8477 | \n", "c1c(ccc(c1C)O)C(C)(C)C | \n", "10.59 | \n", "19 | \n", "0.456479 | \n", "-0.771014 | \n", "-0.482809 | \n", "False | \n", "
347 | \n", "NSC_8510 | \n", "c1cc(ccc1C(=O)OCC)O | \n", "8.34 | \n", "17 | \n", "0.463171 | \n", "-0.762557 | \n", "-0.444474 | \n", "False | \n", "
348 | \n", "NSC_8511 | \n", "c1cc(ccc1C(=O)OCCC)O | \n", "7.91 | \n", "18 | \n", "0.463091 | \n", "-0.762669 | \n", "-0.444615 | \n", "False | \n", "
349 | \n", "NSC_85228 | \n", "C(C)O | \n", "15.90 | \n", "9 | \n", "0.434784 | \n", "-0.740200 | \n", "-0.656838 | \n", "False | \n", "
350 | \n", "NSC_85232 | \n", "CO | \n", "15.30 | \n", "6 | \n", "0.434178 | \n", "-0.733705 | \n", "-0.661997 | \n", "False | \n", "
351 | \n", "NSC_85475 | \n", "c1(cc(c(cc1)O)CO)C | \n", "10.15 | \n", "19 | \n", "0.456658 | \n", "-0.773334 | \n", "-0.485838 | \n", "False | \n", "
352 | \n", "NSC_87078 | \n", "Fc1cc(ccc1)O | \n", "9.21 | \n", "13 | \n", "0.462378 | \n", "-0.764281 | \n", "-0.475697 | \n", "False | \n", "
353 | \n", "NSC_8768 | \n", "c1c(cccc1O)C | \n", "10.09 | \n", "16 | \n", "0.457939 | \n", "-0.768640 | \n", "-0.483908 | \n", "False | \n", "
354 | \n", "NSC_88303 | \n", "c1c(ccc(c1)O)C(F)(F)F | \n", "8.68 | \n", "16 | \n", "0.463668 | \n", "-0.762624 | \n", "-0.461167 | \n", "False | \n", "
355 | \n", "NSC_8837 | \n", "C(C)OCCO | \n", "14.80 | \n", "16 | \n", "0.435828 | \n", "-0.734773 | \n", "-0.644280 | \n", "False | \n", "
356 | \n", "NSC_8873 | \n", "c1c(cccc1O)CC | \n", "9.90 | \n", "19 | \n", "0.457912 | \n", "-0.768762 | \n", "-0.483552 | \n", "False | \n", "
357 | \n", "NSC_8885 | \n", "c1c(cc(cc1O)C)CC | \n", "10.10 | \n", "22 | \n", "0.457785 | \n", "-0.770413 | \n", "-0.483136 | \n", "False | \n", "
358 | \n", "NSC_8895 | \n", "c1c(c(ccc1CC=C)O)OC | \n", "10.19 | \n", "19 | \n", "0.457462 | \n", "-0.760319 | \n", "-0.468112 | \n", "False | \n", "
359 | \n", "NSC_91527 | \n", "c1(ccc(cc1)[C@@H](c1ccc(cc1)O)c1ccccc1CO)O | \n", "9.65 | \n", "39 | \n", "0.458003 | \n", "-0.768359 | \n", "-0.471529 | \n", "False | \n", "
360 | \n", "NSC_9230 | \n", "OCC(O)CO | \n", "14.40 | \n", "7 | \n", "0.437357 | \n", "-0.732922 | \n", "-0.635230 | \n", "False | \n", "
361 | \n", "NSC_9247 | \n", "c1c(ccc(c1)O)O | \n", "10.85 | \n", "13 | \n", "0.456512 | \n", "-0.768012 | \n", "-0.483003 | \n", "False | \n", "
362 | \n", "NSC_9268 | \n", "c1c(cc(cc1C)C)O | \n", "10.19 | \n", "19 | \n", "0.457821 | \n", "-0.770308 | \n", "-0.483535 | \n", "False | \n", "
363 | \n", "NSC_93876 | \n", "OCCO | \n", "15.10 | \n", "5 | \n", "0.436004 | \n", "-0.733223 | \n", "-0.644783 | \n", "False | \n", "
364 | \n", "NSC_9586 | \n", "c1(c2ccccc2ccc1)O | \n", "9.34 | \n", "19 | \n", "0.461046 | \n", "-0.767400 | \n", "-0.441500 | \n", "False | \n", "
365 | \n", "NSC_96336 | \n", "C(F)(F)(F)C(O)C(F)(F)F | \n", "9.30 | \n", "12 | \n", "0.471990 | \n", "-0.722442 | \n", "-0.591205 | \n", "False | \n", "
366 | \n", "NSC_9775 | \n", "C1c2ccc(cc2CC1)O | \n", "10.32 | \n", "20 | \n", "0.456841 | \n", "-0.770016 | \n", "-0.482531 | \n", "False | \n", "
367 | \n", "NSC_98355 | \n", "c1c(ccc(c1C)O)C(C)(C)C | \n", "10.59 | \n", "19 | \n", "0.456479 | \n", "-0.771014 | \n", "-0.482809 | \n", "False | \n", "
368 | \n", "NSC_9884 | \n", "c1c(cc(cc1)O)C(F)(F)F | \n", "8.95 | \n", "16 | \n", "0.462942 | \n", "-0.762941 | \n", "-0.472215 | \n", "False | \n", "
369 | \n", "NSC_9885 | \n", "c1c(ccc(c1)O)OCC | \n", "10.13 | \n", "15 | \n", "0.455104 | \n", "-0.769984 | \n", "-0.490652 | \n", "False | \n", "
370 | \n", "NSC_9887 | \n", "c1(c(c(c(c(c1)Cl)O)Cc1c(c(cc(c1O)Cl)Cl)Cl)Cl)Cl | \n", "4.95 | \n", "27 | \n", "0.467314 | \n", "-0.747633 | \n", "-0.402338 | \n", "False | \n", "
371 rows × 8 columns
\n", "\n", " | pKa | \n", "H_ID | \n", "qH | \n", "qO | \n", "qOd | \n", "
---|---|---|---|---|---|
count | \n", "371.000000 | \n", "371.000000 | \n", "371.000000 | \n", "371.000000 | \n", "371.000000 | \n", "
mean | \n", "6.503881 | \n", "17.859838 | \n", "0.473013 | \n", "-0.750591 | \n", "-0.509098 | \n", "
std | \n", "3.604279 | \n", "7.807258 | \n", "0.014860 | \n", "0.014849 | \n", "0.048126 | \n", "
min | \n", "0.510000 | \n", "5.000000 | \n", "0.429287 | \n", "-0.786837 | \n", "-0.661997 | \n", "
25% | \n", "3.700000 | \n", "14.000000 | \n", "0.460489 | \n", "-0.762620 | \n", "-0.531205 | \n", "
50% | \n", "4.800000 | \n", "17.000000 | \n", "0.478824 | \n", "-0.753064 | \n", "-0.513323 | \n", "
75% | \n", "9.700000 | \n", "20.000000 | \n", "0.484448 | \n", "-0.738481 | \n", "-0.482494 | \n", "
max | \n", "17.600000 | \n", "82.000000 | \n", "0.500662 | \n", "-0.709112 | \n", "-0.380861 | \n", "