Module propfit.propfit
Classes
class PropFit (filename=None,
props=['Gh', 'Hh', 'Cph', 'V', 'Hig', 'Sig', 'Cpig'],
group_file=None)-
Expand source code
class PropFit(Estimate): """ Regress thermodynamic properties of molecular groups for use in estimation of compounds with no experimental data. Parameters ---------- filename : str Name of csv with compound names and thermodynamic properties you want to regress. props : list of strings Thermodynamic properties you want to regress group data for. Must match the property columns in the input file. """ def __init__(self, filename=None, props=['Gh','Hh','Cph','V','Hig','Sig','Cpig'], group_file=None): if isinstance(filename, str): self.input_df = pd.read_csv(filename) else: with import_package_file(__name__, 'default databases/default database.csv', as_file=True) as path: self.input_df = pd.read_csv(path) if isinstance(group_file, str): self.group_df = pd.read_csv(group_file) else: self.group_df = None self.props = props self.smiles = None self.err_handler = Error_Handler(clean=False) def dataprep(self, average=True, order=2, output_name = None): """ Prepare a dataframe that consists of molecule names, properties, and group match data. Parameters ---------- average : bool, default True Average repeat measurements for compound properties? order : numeric or str, default 2 Order of approximation for splitting molecules into groups. Accepts "2", 2, "1", 1, or 'custom'. If 'custom', you must provide a group matching csv named 'custom groups.csv'. output_name : str, default 'properties and groups.csv' Name of the CSV file that will be generated. """ input_df = self.input_df smile_df = input_df.copy() if output_name == None: output_name = 'properties and groups.csv' else: output_name = output_name for c in list(input_df.columns)[1:]: if c not in self.props: input_df.drop(c, axis=1, inplace=True) if average==True: new_df = pd.DataFrame(columns = input_df.columns) for i in input_df.index: c = input_df.loc[i, 'compound'] if c not in list(new_df.compound): new_df.loc[i, 'compound'] = c for p in self.props: temp_df = input_df.loc[input_df['compound'] == c].loc[~input_df[p].isnull()] if len(temp_df)>0: new_df.loc[i, p] = np.nanmean(temp_df[p]) input_df = new_df if isinstance(self.group_df, pd.DataFrame): group_df = self.group_df elif order == 1 or order == '1': with import_package_file(__name__, 'default databases/1st order groups.csv', as_file=True) as path: group_df = pd.read_csv(path) elif order == 2 or order == '2': with import_package_file(__name__, 'default databases/2nd order groups.csv', as_file=True) as path: group_df = pd.read_csv(path) group_df.replace(np.nan, '', inplace=True) self.group_df = group_df keys = list(group_df['keys']) values = list(group_df['values']) pattern_dict = dict(zip(keys, values)) for key in pattern_dict: if str(pattern_dict[key]) == 'nan': pattern_dict[key] = '' self.pattern_dict = pattern_dict keys += ['formula'] df = pd.DataFrame(columns = ['compound']+keys) vetted_mol = [] ind = 0 molecules = list(input_df["compound"]) for molecule in molecules: if molecule not in vetted_mol: self.name = molecule temp_smile_df = smile_df.loc[smile_df['compound']==molecule].loc[~smile_df['SMILES'].isnull()].copy() if len(temp_smile_df)>0: self.smiles = temp_smile_df['SMILES'].values[0] else: self.pcp_compound = pcp.get_compounds(self.name, "name") self.smiles = self.pcp_compound[0].connectivity_smiles self.get_mol_smiles_formula_formula_dict() temp_dict = self.match_groups() values = list(temp_dict.values()) temp_df = pd.DataFrame(columns = ['compound']+keys) temp_df.loc[0, 'compound'] = molecule temp_df.loc[0, keys] = values df = pd.concat([df, temp_df]) vetted_mol.append(molecule) failures = len(vetted_mol) - len(molecules) if failures > 0: print('There were '+str(failures)+' molecules that did not work') print([m for m in molecules if m not in vetted_mol]) ### now add props key_df = pd.DataFrame(columns = keys) for k in keys: key_df[k] = [np.nan]*len(input_df) prop_df = pd.concat([input_df, key_df], axis=1) prop_df['formula'] = prop_df['formula'].astype(str) prop_df = prop_df.loc[~prop_df['compound'].isnull()] for i in prop_df.index: compound = prop_df.loc[i, 'compound'] values = list(df.loc[df['compound'] == compound][keys].values[0]) prop_df.loc[i, keys] = values prop_df.to_csv(output_name, index=False) print(output_name + ' created') def group_property_estimator(self, filename, props=None, ignore = None): """ Regress the properties of group matches based on thermodynamic properties of molecules and molecular group matches. Parameters ---------- filename : str Name of the file containing compounds, properties, and group matching data. props : list of strings Thermodynamic properties you want to regress group data for. Must match the property columns in the input file. ignore : list of strings Name of columns in the input file which are not thermodynamic properties and therefore should not be regressed. """ if not isinstance(props, list): props=self.props df1 = pd.read_csv(filename) if isinstance(ignore, list): df1.drop(ignore, axis=1, inplace=True) bad_props = [] for p in props: if p not in list(df1.columns): bad_props.append(p) props = [p for p in props if p not in bad_props] for dependent_param in props: df_data = df1.copy() # remove columns containing 0 groups df_data = df_data.loc[:, (df_data != 0).any(axis=0)] # get data subset that needs a prediction df_topred = df_data[np.isfinite(df_data[dependent_param]) == False] # get data subset that does not need a prediction df_not_topred = df_data[np.isfinite(df_data[dependent_param]) == True] # delete rows representing compounds that need a prediction when they contain a group that is not # represented in the training set. delrows = [] for col in df_not_topred.columns.values: try: if sum(df_not_topred[col]) == 0: for row in df_topred.index.values: if df_topred.loc[row, col] != 0: delrows.append(row) except: pass bad_df = df_topred.index.isin(delrows) df_topred = df_topred[~bad_df] # remove rows without y values df_data = df_data[np.isfinite(df_data[dependent_param])] X = df_data[[x for x in list(df_data.columns.values) if not x in ["compound", "formula", dependent_param]+props]].copy() y = df_data[dependent_param].copy() X_topred = df_topred[[x for x in list(df_topred.columns.values) if not x in ["compound", "formula", dependent_param]+props]].copy() if 'ig' not in dependent_param: ## add material point X.loc[:, "material point"] = [1]* len(X) X_topred.loc[:, "material point"] = [1]* len(X_topred) multi_reg = sm.OLS(y[0:], X[0:]).fit() # perform the multiple regression prediction = multi_reg.predict(X) # make the predictions from the multi_reg preds = multi_reg.predict(X_topred) material_point = 0 material_point_err = 0 group_property_dict = dict(zip(X.columns.values, [round(val, 4) for val in multi_reg.params.values])) group_property_se_dict = dict(zip(X.columns.values, [round(val, 4) for val in multi_reg.bse.values])) pred_errs = [sum([n_group*group_property_se_dict[group]**2 for n_group, group in zip(X.loc[idx], X.columns.values)])**0.5 for idx in X.index] topred_errs = [sum([n_group*group_property_se_dict[group]**2 for n_group, group in zip(X_topred.loc[idx], X_topred.columns.values)])**0.5 for idx in X_topred.index] comp_pred_df = pd.DataFrame({"compound":list(df_data["compound"]), "actual":df_data[dependent_param], "prediction":[round(pred+material_point, 2) for pred in prediction.values], "pred errs":[round(err, 2) for err in pred_errs]}) df_preds = pd.DataFrame({"compound":list(df_topred["compound"]), "actual":df_topred[dependent_param], "prediction":[round(pred+material_point, 2) for pred in preds.values], "pred errs":[round(err, 2) for err in topred_errs]}) df_final = pd.concat([comp_pred_df,df_preds]) #is this happening on the right axis? with pd.option_context('display.max_rows', None, 'display.max_columns', None): df_group_property = pd.DataFrame(group_property_dict.items(), columns=['group', 'value']) df_group_se = pd.DataFrame(group_property_se_dict.items(), columns=['group', 'std err']) save_as = filename.split('.csv')[0]+' regressed' df_final.to_csv(save_as+"_"+dependent_param+".csv", index=False) df_group_property.to_csv(save_as+"_"+dependent_param+"_group_property.csv", index=False) #reports 0s when not able to estimate df_group_se.to_csv(save_as+"_"+dependent_param+"_group_se.csv", index=False) def generate(self, filename = 'properties and groups regressed', order=2, hyd_props = ['Gh','Hh','Cph','V'], gas_props = ['Hig','Sig','Cpig']): """ Generate the group thermodynamic property databases needed to put into AqOrg's Estimate() function. Parameters ---------- filename : str, default 'properties and groups regressed' Common start to filenames that were generated in the group_property_estimaor() function. order : numeric or str, default 2 Order of approximation for splitting molecules into groups. Accepts "2", 2, "1", 1, or 'custom'. If 'custom', you must provide a group matching csv named 'custom groups.csv'. This has to match the order which was used to make the properties and groups dataframe. hyd_props : list of strings, default ['Gh','Hh','Cph','V'] Thermodynamic properties of hydration you want to regress group data for. Must match the property columns in the input file. gas_props : list of strings, default ['Hig','Sig','Cpig'] Thermodynamic properties of formation of ideal gases you want to regress group data for. Must match the property columns in the input file. """ save_as = filename hyd_cols = [] for h in hyd_props: hyd_cols += [h, h+'_err', h+'_n'] gas_cols = [] for g in gas_props: gas_cols += [g, g+'_err', g+'_n'] group_df = self.group_df keys = list(group_df['keys']) values = list(group_df['values']) pattern_dict = dict(zip(keys, values)) for key in pattern_dict: if str(pattern_dict[key]) == 'nan': pattern_dict[key] = '' self.pattern_dict = pattern_dict hyd = pd.DataFrame(columns = ['group']+hyd_cols+['smarts','elem']) temp_df = pd.read_csv(save_as+'_'+hyd_props[0]+'_group_property.csv') for i in temp_df.index: group = temp_df.loc[i, 'group'] hyd.loc[i, 'group'] = group hyd.loc[i, 'smarts'] = group if group != 'material point': hyd.loc[i, 'elem'] = self.pattern_dict[group] group_df = pd.read_csv(save_as.split(' regressed')[0]+'.csv') ind = max([list(group_df.columns).index(p) for p in self.props])+1 end = list(group_df.columns).index('formula') groups = list(group_df.columns)[ind:end] remove = [] for p in hyd_props: prop_df = pd.read_csv(save_as+'_'+p+'_group_property.csv') err_df = pd.read_csv(save_as+'_'+p+'_group_se.csv') for i in prop_df.index: group = prop_df.loc[i, 'group'] value = prop_df.loc[i, 'value'] if err_df.loc[i, 'group'] != group: print('err') if value == 0 and i not in remove: remove.append(i) if value != 0: err = err_df.loc[i, 'std err'] cnt = 0 if group != 'material point': cnt = group_df.loc[group_df[group]>0][p].count() hyd.loc[i, p] = value hyd.loc[i, p+'_err'] = err hyd.loc[i, p+'_n'] = cnt hyd.drop(remove, inplace=True) gas = pd.DataFrame(columns = ['group']+gas_cols+['smarts','elem']) temp_df = pd.read_csv(save_as+'_'+gas_props[0]+'_group_property.csv') for i in temp_df.index: group = temp_df.loc[i, 'group'] gas.loc[i, 'group'] = group gas.loc[i, 'smarts'] = group if group != 'material point': gas.loc[i, 'elem'] = self.pattern_dict[group] remove = [] for p in gas_props: prop_df = pd.read_csv(save_as+'_'+p+'_group_property.csv') err_df = pd.read_csv(save_as+'_'+p+'_group_se.csv') for i in prop_df.index: group = prop_df.loc[i, 'group'] value = prop_df.loc[i, 'value'] if err_df.loc[i, 'group'] != group: print('err') if value == 0 and i not in remove: remove.append(i) if value != 0: err = err_df.loc[i, 'std err'] cnt = 0 if group != 'material point': cnt = group_df.loc[group_df[group]>0][p].count() gas.loc[i, p] = value gas.loc[i, p+'_err'] = err gas.loc[i, p+'_n'] = cnt gas.drop(remove, inplace=True) gas.insert(1, 'Gig', np.nan) for i in gas.index: elements = gas.loc[i, 'elem'] if str(elements) not in ['nan', '']: reset(messages=False) Selem = entropy(elements)*4.184 Sig = gas.loc[i, 'Sig'] dS = Sig - Selem gas.loc[i, 'Gig'] = round((gas.loc[i, 'Hig']*1000 - 298.15*dS)/1000, 3) gas_ind = max(gas.index)+1 gas.loc[gas_ind, :] = 0 gas.loc[gas_ind, 'group']='Yo' gas.loc[gas_ind, 'smarts']='Yo' hyd_ind = hyd.loc[hyd['group']=='material point'].index[0] hyd.loc[hyd_ind, 'group']='Yo' hyd.loc[hyd_ind, 'smarts']='Yo' gas.to_csv('gas props.csv', index=False) hyd.to_csv('hyd props.csv', index=False) def tts(self, repeats = 100, test_size = 0.2, filename = None, output_name = 'stats df.csv', show=True): """ Perform a train-test split on the thermodynamic data you will regress and estimate properties from. Parameters ---------- repeats: int, default 100 Number of semi-random iterations of train-test splitting to do. The iterations will be used to calculate statistics associated with the semi-random sampling process. test_size: float, default 0.2 Fraction of the database for each thermodynamic property that should be used as the test set. filename : str, default properties and groups.csv Common start to filenames that were generated in the group_property_estimaor() function. output_name : str, default 'stats df.csv' Name of the CSV file that will be generated, composed of the average and standard deviation of the RMSEs of estimations associated with each iterations. show : bool, default True Show barplot for training and test data for each property? """ if filename == None: filename = 'properties and groups.csv' df = pd.read_csv(filename) stats_df = pd.DataFrame() for prop_ind, p in enumerate(self.props): temp_props = self.props.copy() temp_props.remove(p) prop_df = df.loc[~df[p].isnull()].drop(temp_props, axis=1) prop_df.reset_index(inplace=True, drop=True) for c in prop_df.columns[2:-1]: if sum(prop_df[c]) == 0: prop_df.drop(c, axis=1, inplace=True) groups = list(prop_df.columns)[2:-1] data_len = len(prop_df) prop = prop_df.columns[1] compounds = [] for c in list(prop_df['compound']): if c not in compounds: compounds.append(c) ### sorting by least represented groups to make sampling easier below lengths = [] for g in groups: length = len(prop_df.loc[prop_df[g]>0]) #how many times is the group represented lengths.append(length) length_df = pd.DataFrame() length_df['groups'] = groups length_df['lengths'] = lengths length_df.sort_values(by = 'lengths', inplace=True) #least common groups first groups = list(length_df['groups']) ### choosing one rep per group reps = [] for g in groups: mols_w_data = [] temp_df = prop_df.loc[prop_df[g]>0].loc[~prop_df['compound'].isin(reps)] if len(temp_df)>0: for c in temp_df.compound: if c not in mols_w_data: mols_w_data.append(c) reps.append(random.sample(mols_w_data, 1)[0]) non_reps = [c for c in compounds if c not in reps] train = reps.copy() test = non_reps.copy() test_df = prop_df.loc[prop_df['compound'].isin(test)] train_df = prop_df.loc[prop_df['compound'].isin(train)] test_df.to_csv(p+'_test.csv', index=False) train_df.to_csv(p+'_train.csv', index=False) train_RMSEs = [] test_RMSEs = [] for r in range(0, repeats): test_df = pd.read_csv(p+'_test.csv') train_df = pd.read_csv(p+'_train.csv') ## bare min training set so all groups are represented total_df = pd.concat([train_df, test_df], ignore_index=True) compounds = [] test = [] train = [] for j in total_df.index: compound = total_df.loc[j, 'compound'] if compound not in compounds: compounds.append(compound) if compound in list(test_df['compound']) and compound not in test: test.append(compound) if compound in list(train_df['compound']) and compound not in train: train.append(compound) ## randomly harvesting more data for training set for i in range(0, len(test)): if len(total_df.loc[total_df['compound'].isin(test)])/len(total_df) > test_size: rand = random.sample(test, 1)[0] train.append(rand) test.remove(rand) test_df = total_df.loc[total_df['compound'].isin(test)] for i in test_df.index: test_df.loc[i, prop] = np.nan train_df = total_df.loc[total_df['compound'].isin(train)] total_df = pd.concat([train_df, test_df]) file_name = 'split_'+p+'_test.csv' total_df.to_csv(file_name, index=False) save_as = file_name.split('.csv')[0]+' regressed' sig_figs = 3 fixed_material_point = False estimate_material_point = True self.group_property_estimator(file_name, [p]) SEs=[] df3 = pd.read_csv(save_as+'_'+p+'.csv') df3 = df3.loc[~df3['actual'].isnull()] for c in train: temp_df=df3.loc[df3['compound']==c] actual=np.average(temp_df['actual'].values) pred = temp_df['prediction'].values[0] SEs.append((pred-actual)**2) train_RMSE = np.sqrt(np.nanmean(SEs)) train_RMSEs.append(train_RMSE) df3 = pd.read_csv('split_'+p+'_test regressed_'+p+'.csv') df3 = df3.loc[df3['actual'].isnull()].loc[~df3['prediction'].isnull()] comps = [] for k in df3.index: c = df3.loc[k, 'compound'] if c not in comps: comps.append(c) SEs = [] for c in comps: pred = df3.loc[df3['compound'] == c]['prediction'].values[0] actual = np.nanmean(df.loc[df['compound'] == c][p]) SEs.append((pred-actual)**2) test_RMSE = np.sqrt(np.nanmean(SEs)) test_RMSEs.append(test_RMSE) dataset1 = [] dataset2 = [] stats_df.loc[prop_ind, 'property'] = p dataset1.append(np.array(train_RMSEs)) dataset2.append(np.array(test_RMSEs)) avg1 = np.nanmean(train_RMSEs) avg2 = np.nanmean(test_RMSEs) stdev1 = np.std(train_RMSEs) stdev2 = np.std(test_RMSEs) stats_df.loc[prop_ind, 'train avg'] = avg1 stats_df.loc[prop_ind, 'train stdev'] = stdev1 stats_df.loc[prop_ind, 'test avg'] = avg2 stats_df.loc[prop_ind, 'test stdev'] = stdev2 stats_df.to_csv(output_name, index=False) if show == True: colors = ['lightblue','orange','red','blue','sienna','pink','gold','navy','black'] fig, ax = plt.subplots(1, len(self.props), figsize=(len(self.props)*2, 2)) for n, p in enumerate(self.props): test_val = stats_df.loc[n, 'test avg'] train_val = stats_df.loc[n, 'train avg'] test_err = stats_df.loc[n, 'test stdev'] train_err = stats_df.loc[n, 'train stdev'] ax[n].bar(0, train_val, yerr = train_err, capsize=5, color=colors[n], label='train', edgecolor='black') ax[n].bar(1, test_val, yerr = test_err, capsize=5, color=colors[n], label='test', edgecolor='black', alpha=0.5, hatch='//') ax[n].set_title(self.props[n]) ax[n].set_xticks([0, 1], ['train', 'test']) fig.suptitle('Train test splitting thermodynamic properties', y=1.1) ax[0].set_ylabel('RMSE') fig.subplots_adjust(wspace=0.3) plt.savefig('train test split barplots.pdf', bbox_inches='tight')Regress thermodynamic properties of molecular groups for use in estimation of compounds with no experimental data.
Parameters
filename:str- Name of csv with compound names and thermodynamic properties you want to regress.
props:listofstrings- Thermodynamic properties you want to regress group data for. Must match the property columns in the input file.
Ancestors
- aqorg.AqOrg.Estimate
Methods
def dataprep(self, average=True, order=2, output_name=None)-
Expand source code
def dataprep(self, average=True, order=2, output_name = None): """ Prepare a dataframe that consists of molecule names, properties, and group match data. Parameters ---------- average : bool, default True Average repeat measurements for compound properties? order : numeric or str, default 2 Order of approximation for splitting molecules into groups. Accepts "2", 2, "1", 1, or 'custom'. If 'custom', you must provide a group matching csv named 'custom groups.csv'. output_name : str, default 'properties and groups.csv' Name of the CSV file that will be generated. """ input_df = self.input_df smile_df = input_df.copy() if output_name == None: output_name = 'properties and groups.csv' else: output_name = output_name for c in list(input_df.columns)[1:]: if c not in self.props: input_df.drop(c, axis=1, inplace=True) if average==True: new_df = pd.DataFrame(columns = input_df.columns) for i in input_df.index: c = input_df.loc[i, 'compound'] if c not in list(new_df.compound): new_df.loc[i, 'compound'] = c for p in self.props: temp_df = input_df.loc[input_df['compound'] == c].loc[~input_df[p].isnull()] if len(temp_df)>0: new_df.loc[i, p] = np.nanmean(temp_df[p]) input_df = new_df if isinstance(self.group_df, pd.DataFrame): group_df = self.group_df elif order == 1 or order == '1': with import_package_file(__name__, 'default databases/1st order groups.csv', as_file=True) as path: group_df = pd.read_csv(path) elif order == 2 or order == '2': with import_package_file(__name__, 'default databases/2nd order groups.csv', as_file=True) as path: group_df = pd.read_csv(path) group_df.replace(np.nan, '', inplace=True) self.group_df = group_df keys = list(group_df['keys']) values = list(group_df['values']) pattern_dict = dict(zip(keys, values)) for key in pattern_dict: if str(pattern_dict[key]) == 'nan': pattern_dict[key] = '' self.pattern_dict = pattern_dict keys += ['formula'] df = pd.DataFrame(columns = ['compound']+keys) vetted_mol = [] ind = 0 molecules = list(input_df["compound"]) for molecule in molecules: if molecule not in vetted_mol: self.name = molecule temp_smile_df = smile_df.loc[smile_df['compound']==molecule].loc[~smile_df['SMILES'].isnull()].copy() if len(temp_smile_df)>0: self.smiles = temp_smile_df['SMILES'].values[0] else: self.pcp_compound = pcp.get_compounds(self.name, "name") self.smiles = self.pcp_compound[0].connectivity_smiles self.get_mol_smiles_formula_formula_dict() temp_dict = self.match_groups() values = list(temp_dict.values()) temp_df = pd.DataFrame(columns = ['compound']+keys) temp_df.loc[0, 'compound'] = molecule temp_df.loc[0, keys] = values df = pd.concat([df, temp_df]) vetted_mol.append(molecule) failures = len(vetted_mol) - len(molecules) if failures > 0: print('There were '+str(failures)+' molecules that did not work') print([m for m in molecules if m not in vetted_mol]) ### now add props key_df = pd.DataFrame(columns = keys) for k in keys: key_df[k] = [np.nan]*len(input_df) prop_df = pd.concat([input_df, key_df], axis=1) prop_df['formula'] = prop_df['formula'].astype(str) prop_df = prop_df.loc[~prop_df['compound'].isnull()] for i in prop_df.index: compound = prop_df.loc[i, 'compound'] values = list(df.loc[df['compound'] == compound][keys].values[0]) prop_df.loc[i, keys] = values prop_df.to_csv(output_name, index=False) print(output_name + ' created')Prepare a dataframe that consists of molecule names, properties, and group match data.
Parameters
average:bool, defaultTrue- Average repeat measurements for compound properties?
order:numericorstr, default2- Order of approximation for splitting molecules into groups. Accepts "2", 2, "1", 1, or 'custom'. If 'custom', you must provide a group matching csv named 'custom groups.csv'.
output_name:str, default'properties and groups.csv'- Name of the CSV file that will be generated.
def generate(self,
filename='properties and groups regressed',
order=2,
hyd_props=['Gh', 'Hh', 'Cph', 'V'],
gas_props=['Hig', 'Sig', 'Cpig'])-
Expand source code
def generate(self, filename = 'properties and groups regressed', order=2, hyd_props = ['Gh','Hh','Cph','V'], gas_props = ['Hig','Sig','Cpig']): """ Generate the group thermodynamic property databases needed to put into AqOrg's Estimate() function. Parameters ---------- filename : str, default 'properties and groups regressed' Common start to filenames that were generated in the group_property_estimaor() function. order : numeric or str, default 2 Order of approximation for splitting molecules into groups. Accepts "2", 2, "1", 1, or 'custom'. If 'custom', you must provide a group matching csv named 'custom groups.csv'. This has to match the order which was used to make the properties and groups dataframe. hyd_props : list of strings, default ['Gh','Hh','Cph','V'] Thermodynamic properties of hydration you want to regress group data for. Must match the property columns in the input file. gas_props : list of strings, default ['Hig','Sig','Cpig'] Thermodynamic properties of formation of ideal gases you want to regress group data for. Must match the property columns in the input file. """ save_as = filename hyd_cols = [] for h in hyd_props: hyd_cols += [h, h+'_err', h+'_n'] gas_cols = [] for g in gas_props: gas_cols += [g, g+'_err', g+'_n'] group_df = self.group_df keys = list(group_df['keys']) values = list(group_df['values']) pattern_dict = dict(zip(keys, values)) for key in pattern_dict: if str(pattern_dict[key]) == 'nan': pattern_dict[key] = '' self.pattern_dict = pattern_dict hyd = pd.DataFrame(columns = ['group']+hyd_cols+['smarts','elem']) temp_df = pd.read_csv(save_as+'_'+hyd_props[0]+'_group_property.csv') for i in temp_df.index: group = temp_df.loc[i, 'group'] hyd.loc[i, 'group'] = group hyd.loc[i, 'smarts'] = group if group != 'material point': hyd.loc[i, 'elem'] = self.pattern_dict[group] group_df = pd.read_csv(save_as.split(' regressed')[0]+'.csv') ind = max([list(group_df.columns).index(p) for p in self.props])+1 end = list(group_df.columns).index('formula') groups = list(group_df.columns)[ind:end] remove = [] for p in hyd_props: prop_df = pd.read_csv(save_as+'_'+p+'_group_property.csv') err_df = pd.read_csv(save_as+'_'+p+'_group_se.csv') for i in prop_df.index: group = prop_df.loc[i, 'group'] value = prop_df.loc[i, 'value'] if err_df.loc[i, 'group'] != group: print('err') if value == 0 and i not in remove: remove.append(i) if value != 0: err = err_df.loc[i, 'std err'] cnt = 0 if group != 'material point': cnt = group_df.loc[group_df[group]>0][p].count() hyd.loc[i, p] = value hyd.loc[i, p+'_err'] = err hyd.loc[i, p+'_n'] = cnt hyd.drop(remove, inplace=True) gas = pd.DataFrame(columns = ['group']+gas_cols+['smarts','elem']) temp_df = pd.read_csv(save_as+'_'+gas_props[0]+'_group_property.csv') for i in temp_df.index: group = temp_df.loc[i, 'group'] gas.loc[i, 'group'] = group gas.loc[i, 'smarts'] = group if group != 'material point': gas.loc[i, 'elem'] = self.pattern_dict[group] remove = [] for p in gas_props: prop_df = pd.read_csv(save_as+'_'+p+'_group_property.csv') err_df = pd.read_csv(save_as+'_'+p+'_group_se.csv') for i in prop_df.index: group = prop_df.loc[i, 'group'] value = prop_df.loc[i, 'value'] if err_df.loc[i, 'group'] != group: print('err') if value == 0 and i not in remove: remove.append(i) if value != 0: err = err_df.loc[i, 'std err'] cnt = 0 if group != 'material point': cnt = group_df.loc[group_df[group]>0][p].count() gas.loc[i, p] = value gas.loc[i, p+'_err'] = err gas.loc[i, p+'_n'] = cnt gas.drop(remove, inplace=True) gas.insert(1, 'Gig', np.nan) for i in gas.index: elements = gas.loc[i, 'elem'] if str(elements) not in ['nan', '']: reset(messages=False) Selem = entropy(elements)*4.184 Sig = gas.loc[i, 'Sig'] dS = Sig - Selem gas.loc[i, 'Gig'] = round((gas.loc[i, 'Hig']*1000 - 298.15*dS)/1000, 3) gas_ind = max(gas.index)+1 gas.loc[gas_ind, :] = 0 gas.loc[gas_ind, 'group']='Yo' gas.loc[gas_ind, 'smarts']='Yo' hyd_ind = hyd.loc[hyd['group']=='material point'].index[0] hyd.loc[hyd_ind, 'group']='Yo' hyd.loc[hyd_ind, 'smarts']='Yo' gas.to_csv('gas props.csv', index=False) hyd.to_csv('hyd props.csv', index=False)Generate the group thermodynamic property databases needed to put into AqOrg's Estimate() function.
Parameters
filename:str, default'properties and groups regressed'- Common start to filenames that were generated in the group_property_estimaor() function.
order:numericorstr, default2- Order of approximation for splitting molecules into groups. Accepts "2", 2, "1", 1, or 'custom'. If 'custom', you must provide a group matching csv named 'custom groups.csv'. This has to match the order which was used to make the properties and groups dataframe.
hyd_props:listofstrings, default['Gh','Hh','Cph','V']- Thermodynamic properties of hydration you want to regress group data for. Must match the property columns in the input file.
gas_props:listofstrings, default['Hig','Sig','Cpig']- Thermodynamic properties of formation of ideal gases you want to regress group data for. Must match the property columns in the input file.
def group_property_estimator(self, filename, props=None, ignore=None)-
Expand source code
def group_property_estimator(self, filename, props=None, ignore = None): """ Regress the properties of group matches based on thermodynamic properties of molecules and molecular group matches. Parameters ---------- filename : str Name of the file containing compounds, properties, and group matching data. props : list of strings Thermodynamic properties you want to regress group data for. Must match the property columns in the input file. ignore : list of strings Name of columns in the input file which are not thermodynamic properties and therefore should not be regressed. """ if not isinstance(props, list): props=self.props df1 = pd.read_csv(filename) if isinstance(ignore, list): df1.drop(ignore, axis=1, inplace=True) bad_props = [] for p in props: if p not in list(df1.columns): bad_props.append(p) props = [p for p in props if p not in bad_props] for dependent_param in props: df_data = df1.copy() # remove columns containing 0 groups df_data = df_data.loc[:, (df_data != 0).any(axis=0)] # get data subset that needs a prediction df_topred = df_data[np.isfinite(df_data[dependent_param]) == False] # get data subset that does not need a prediction df_not_topred = df_data[np.isfinite(df_data[dependent_param]) == True] # delete rows representing compounds that need a prediction when they contain a group that is not # represented in the training set. delrows = [] for col in df_not_topred.columns.values: try: if sum(df_not_topred[col]) == 0: for row in df_topred.index.values: if df_topred.loc[row, col] != 0: delrows.append(row) except: pass bad_df = df_topred.index.isin(delrows) df_topred = df_topred[~bad_df] # remove rows without y values df_data = df_data[np.isfinite(df_data[dependent_param])] X = df_data[[x for x in list(df_data.columns.values) if not x in ["compound", "formula", dependent_param]+props]].copy() y = df_data[dependent_param].copy() X_topred = df_topred[[x for x in list(df_topred.columns.values) if not x in ["compound", "formula", dependent_param]+props]].copy() if 'ig' not in dependent_param: ## add material point X.loc[:, "material point"] = [1]* len(X) X_topred.loc[:, "material point"] = [1]* len(X_topred) multi_reg = sm.OLS(y[0:], X[0:]).fit() # perform the multiple regression prediction = multi_reg.predict(X) # make the predictions from the multi_reg preds = multi_reg.predict(X_topred) material_point = 0 material_point_err = 0 group_property_dict = dict(zip(X.columns.values, [round(val, 4) for val in multi_reg.params.values])) group_property_se_dict = dict(zip(X.columns.values, [round(val, 4) for val in multi_reg.bse.values])) pred_errs = [sum([n_group*group_property_se_dict[group]**2 for n_group, group in zip(X.loc[idx], X.columns.values)])**0.5 for idx in X.index] topred_errs = [sum([n_group*group_property_se_dict[group]**2 for n_group, group in zip(X_topred.loc[idx], X_topred.columns.values)])**0.5 for idx in X_topred.index] comp_pred_df = pd.DataFrame({"compound":list(df_data["compound"]), "actual":df_data[dependent_param], "prediction":[round(pred+material_point, 2) for pred in prediction.values], "pred errs":[round(err, 2) for err in pred_errs]}) df_preds = pd.DataFrame({"compound":list(df_topred["compound"]), "actual":df_topred[dependent_param], "prediction":[round(pred+material_point, 2) for pred in preds.values], "pred errs":[round(err, 2) for err in topred_errs]}) df_final = pd.concat([comp_pred_df,df_preds]) #is this happening on the right axis? with pd.option_context('display.max_rows', None, 'display.max_columns', None): df_group_property = pd.DataFrame(group_property_dict.items(), columns=['group', 'value']) df_group_se = pd.DataFrame(group_property_se_dict.items(), columns=['group', 'std err']) save_as = filename.split('.csv')[0]+' regressed' df_final.to_csv(save_as+"_"+dependent_param+".csv", index=False) df_group_property.to_csv(save_as+"_"+dependent_param+"_group_property.csv", index=False) #reports 0s when not able to estimate df_group_se.to_csv(save_as+"_"+dependent_param+"_group_se.csv", index=False)Regress the properties of group matches based on thermodynamic properties of molecules and molecular group matches.
Parameters
filename:str- Name of the file containing compounds, properties, and group matching data.
props:listofstrings- Thermodynamic properties you want to regress group data for. Must match the property columns in the input file.
ignore:listofstrings- Name of columns in the input file which are not thermodynamic properties and therefore should not be regressed.
def tts(self,
repeats=100,
test_size=0.2,
filename=None,
output_name='stats df.csv',
show=True)-
Expand source code
def tts(self, repeats = 100, test_size = 0.2, filename = None, output_name = 'stats df.csv', show=True): """ Perform a train-test split on the thermodynamic data you will regress and estimate properties from. Parameters ---------- repeats: int, default 100 Number of semi-random iterations of train-test splitting to do. The iterations will be used to calculate statistics associated with the semi-random sampling process. test_size: float, default 0.2 Fraction of the database for each thermodynamic property that should be used as the test set. filename : str, default properties and groups.csv Common start to filenames that were generated in the group_property_estimaor() function. output_name : str, default 'stats df.csv' Name of the CSV file that will be generated, composed of the average and standard deviation of the RMSEs of estimations associated with each iterations. show : bool, default True Show barplot for training and test data for each property? """ if filename == None: filename = 'properties and groups.csv' df = pd.read_csv(filename) stats_df = pd.DataFrame() for prop_ind, p in enumerate(self.props): temp_props = self.props.copy() temp_props.remove(p) prop_df = df.loc[~df[p].isnull()].drop(temp_props, axis=1) prop_df.reset_index(inplace=True, drop=True) for c in prop_df.columns[2:-1]: if sum(prop_df[c]) == 0: prop_df.drop(c, axis=1, inplace=True) groups = list(prop_df.columns)[2:-1] data_len = len(prop_df) prop = prop_df.columns[1] compounds = [] for c in list(prop_df['compound']): if c not in compounds: compounds.append(c) ### sorting by least represented groups to make sampling easier below lengths = [] for g in groups: length = len(prop_df.loc[prop_df[g]>0]) #how many times is the group represented lengths.append(length) length_df = pd.DataFrame() length_df['groups'] = groups length_df['lengths'] = lengths length_df.sort_values(by = 'lengths', inplace=True) #least common groups first groups = list(length_df['groups']) ### choosing one rep per group reps = [] for g in groups: mols_w_data = [] temp_df = prop_df.loc[prop_df[g]>0].loc[~prop_df['compound'].isin(reps)] if len(temp_df)>0: for c in temp_df.compound: if c not in mols_w_data: mols_w_data.append(c) reps.append(random.sample(mols_w_data, 1)[0]) non_reps = [c for c in compounds if c not in reps] train = reps.copy() test = non_reps.copy() test_df = prop_df.loc[prop_df['compound'].isin(test)] train_df = prop_df.loc[prop_df['compound'].isin(train)] test_df.to_csv(p+'_test.csv', index=False) train_df.to_csv(p+'_train.csv', index=False) train_RMSEs = [] test_RMSEs = [] for r in range(0, repeats): test_df = pd.read_csv(p+'_test.csv') train_df = pd.read_csv(p+'_train.csv') ## bare min training set so all groups are represented total_df = pd.concat([train_df, test_df], ignore_index=True) compounds = [] test = [] train = [] for j in total_df.index: compound = total_df.loc[j, 'compound'] if compound not in compounds: compounds.append(compound) if compound in list(test_df['compound']) and compound not in test: test.append(compound) if compound in list(train_df['compound']) and compound not in train: train.append(compound) ## randomly harvesting more data for training set for i in range(0, len(test)): if len(total_df.loc[total_df['compound'].isin(test)])/len(total_df) > test_size: rand = random.sample(test, 1)[0] train.append(rand) test.remove(rand) test_df = total_df.loc[total_df['compound'].isin(test)] for i in test_df.index: test_df.loc[i, prop] = np.nan train_df = total_df.loc[total_df['compound'].isin(train)] total_df = pd.concat([train_df, test_df]) file_name = 'split_'+p+'_test.csv' total_df.to_csv(file_name, index=False) save_as = file_name.split('.csv')[0]+' regressed' sig_figs = 3 fixed_material_point = False estimate_material_point = True self.group_property_estimator(file_name, [p]) SEs=[] df3 = pd.read_csv(save_as+'_'+p+'.csv') df3 = df3.loc[~df3['actual'].isnull()] for c in train: temp_df=df3.loc[df3['compound']==c] actual=np.average(temp_df['actual'].values) pred = temp_df['prediction'].values[0] SEs.append((pred-actual)**2) train_RMSE = np.sqrt(np.nanmean(SEs)) train_RMSEs.append(train_RMSE) df3 = pd.read_csv('split_'+p+'_test regressed_'+p+'.csv') df3 = df3.loc[df3['actual'].isnull()].loc[~df3['prediction'].isnull()] comps = [] for k in df3.index: c = df3.loc[k, 'compound'] if c not in comps: comps.append(c) SEs = [] for c in comps: pred = df3.loc[df3['compound'] == c]['prediction'].values[0] actual = np.nanmean(df.loc[df['compound'] == c][p]) SEs.append((pred-actual)**2) test_RMSE = np.sqrt(np.nanmean(SEs)) test_RMSEs.append(test_RMSE) dataset1 = [] dataset2 = [] stats_df.loc[prop_ind, 'property'] = p dataset1.append(np.array(train_RMSEs)) dataset2.append(np.array(test_RMSEs)) avg1 = np.nanmean(train_RMSEs) avg2 = np.nanmean(test_RMSEs) stdev1 = np.std(train_RMSEs) stdev2 = np.std(test_RMSEs) stats_df.loc[prop_ind, 'train avg'] = avg1 stats_df.loc[prop_ind, 'train stdev'] = stdev1 stats_df.loc[prop_ind, 'test avg'] = avg2 stats_df.loc[prop_ind, 'test stdev'] = stdev2 stats_df.to_csv(output_name, index=False) if show == True: colors = ['lightblue','orange','red','blue','sienna','pink','gold','navy','black'] fig, ax = plt.subplots(1, len(self.props), figsize=(len(self.props)*2, 2)) for n, p in enumerate(self.props): test_val = stats_df.loc[n, 'test avg'] train_val = stats_df.loc[n, 'train avg'] test_err = stats_df.loc[n, 'test stdev'] train_err = stats_df.loc[n, 'train stdev'] ax[n].bar(0, train_val, yerr = train_err, capsize=5, color=colors[n], label='train', edgecolor='black') ax[n].bar(1, test_val, yerr = test_err, capsize=5, color=colors[n], label='test', edgecolor='black', alpha=0.5, hatch='//') ax[n].set_title(self.props[n]) ax[n].set_xticks([0, 1], ['train', 'test']) fig.suptitle('Train test splitting thermodynamic properties', y=1.1) ax[0].set_ylabel('RMSE') fig.subplots_adjust(wspace=0.3) plt.savefig('train test split barplots.pdf', bbox_inches='tight')Perform a train-test split on the thermodynamic data you will regress and estimate properties from.
Parameters
repeats:int, default100- Number of semi-random iterations of train-test splitting to do. The iterations will be used to calculate statistics associated with the semi-random sampling process.
test_size:float, default0.2- Fraction of the database for each thermodynamic property that should be used as the test set.
filename:str, defaultproperties and groups.csv- Common start to filenames that were generated in the group_property_estimaor() function.
output_name:str, default'stats df.csv'- Name of the CSV file that will be generated, composed of the average and standard deviation of the RMSEs of estimations associated with each iterations.
show:bool, defaultTrue- Show barplot for training and test data for each property?