# jpandas.py

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit import DataStructs

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model, externals

from IPython.display import display

# This is James Sungjin Kim's library
import jutil, jchem, jquinone, jgrid
import jfile
import j3x.jpyx

from maml.gp import gaussian_process as gp


def pd_find_SMILES( pdr, s, disp = False, smiles_id = 'SMILES'):
	"""
	Select the rows of pdr whose smiles_id column equals s.

	pdr: table that supports boolean-mask row selection (e.g. a DataFrame).
	s: SMILES string to look for (compared with ==, no canonicalisation).
	disp: when True the selection is also shown with IPython's display().
	Returns the selected rows.
	"""
	is_match = pdr[ smiles_id] == s
	pdw = pdr[ is_match]

	if disp:
		display( pdw)
	return pdw

def list_indices( l, target):
	"""
	Return, in ascending order, every position of l whose element equals target.
	An empty list is returned when target does not occur.
	"""
	positions = []
	for pos, item in enumerate( l):
		if item == target:
			positions.append( pos)
	return positions

def get_duplist( x_list, disp = True):
	"""
	Duplication indices are returned.

	x_list: sequence of hashable values.
	disp: when True, print the index groups and then, per group, the values
	      found at those indices.
	Returns a list with one entry per value that occurs more than once; each
	entry is the ascending list of positions of that value. Groups are ordered
	by the first occurrence of their value, so the result is reproducible
	between runs.
	"""
	# Single pass: remember every position at which each value is seen.
	# dict preserves insertion order, which gives the first-occurrence ordering.
	positions = {}
	for idx, val in enumerate( x_list):
		positions.setdefault( val, []).append( idx)

	duplist = [idx_l for idx_l in positions.values() if len( idx_l) > 1]

	if disp:
		print(duplist)
		for d in duplist:
			print([x_list[x] for x in d])

	return duplist

def check_mol2smiles( x_smiles_1):
	"""
	Canonicalise SMILES codes in place and report the ones rdkit cannot handle.

	x_smiles_1: mutable list of SMILES codes. Every entry is replaced by the
	            SMILES rdkit writes back for it; an entry that cannot be
	            converted is replaced by '' instead.
	Returns the list of positions that failed.
	"""
	fail_list = []
	for ii, smiles in enumerate( x_smiles_1):
		try:
			# Parsing is inside the try as well, so that a non-string entry
			# (e.g. a NaN read from a table) is reported as a failure instead
			# of aborting the whole check.
			x_smiles_1[ii] = Chem.MolToSmiles( Chem.MolFromSmiles( smiles))
		except Exception:
			# Exception (not a bare except) so Ctrl-C still interrupts the loop.
			print(ii, "Faliue")
			fail_list.append( ii)
			x_smiles_1[ii] = ''

	return fail_list


def get_mol2smiles( x_smiles_1):
	"""
	Find smiles codes which can not operate in rdkit now.

	x_smiles_1: mutable list of SMILES codes. Entries that convert cleanly are
	            left untouched (they are NOT canonicalised here, unlike in
	            check_mol2smiles); entries that fail are replaced by ''.
	Returns the list of positions that failed.
	"""
	fail_list = []
	for ii, smiles in enumerate( x_smiles_1):
		try:
			# Parsing is inside the try as well, so that a non-string entry
			# (e.g. a NaN read from a table) is reported as a failure instead
			# of aborting the whole scan.
			Chem.MolToSmiles( Chem.MolFromSmiles( smiles))
		except Exception:
			# Exception (not a bare except) so Ctrl-C still interrupts the loop.
			print(ii, "Faliue")
			fail_list.append( ii)
			x_smiles_1[ii] = ''

	return fail_list

def pd_remove_no_mol2smiles( pdr, smiles_id = 'SMILES'):
	"""
	Drop the rows whose SMILES code get_mol2smiles() reports as failing.

	The positions of the failing codes are handed to pd_remove_faillist_ID(),
	which removes the matching rows through their ID values.
	Returns the filtered table.
	"""
	smiles_l = pdr[ smiles_id].tolist()
	return pd_remove_faillist_ID( pdr, get_mol2smiles( smiles_l))


def pd_refine_smiles( pdr, smiles_id = 'SMILES'):
	"""
	Rewrite the smiles_id column with the SMILES rdkit generates for each entry.

	The column of the given pdr is overwritten in place and the same pdr
	object is returned.
	"""
	# Parse every code first, then write all of them back.
	mol_l = [Chem.MolFromSmiles( code) for code in pdr[ smiles_id]]
	pdr[ smiles_id] = [Chem.MolToSmiles( mol) for mol in mol_l]

	return pdr

def pd_clean_smiles( pdr, smiles_id = 'SMILES'):
	"""
	Run the three clean-up stages on pdr and return the result:
	  1. pd_remove_no_mol2smiles - drop entries rdkit cannot convert,
	  2. pd_refine_smiles        - rewrite the SMILES codes through rdkit,
	  3. pd_remove_dup_smiles    - drop entries with a repeated SMILES code.

	Raises TypeError when pdr has no 'ID' key, because the removal stages
	identify entries by their ID value.
	"""
	if 'ID' not in pdr.keys():
		raise TypeError( 'pdr should have a key of ID.')

	print('1. All columns each with a smile code not supported in rdkit are removing.')
	supported = pd_remove_no_mol2smiles( pdr, smiles_id = smiles_id)

	print('2. Smiles are refined by rdkit')
	refined = pd_refine_smiles( supported, smiles_id = smiles_id)

	for line in ('3. Removing columns with duplicated smiles codes.',
			'   - you may check properties for the same smiles code molecules:',
			'     pd_get_dup_smiles_and_property()'):
		print(line)
	return pd_remove_dup_smiles( refined, smiles_id = smiles_id)


def pd_remove_duplist_ID( pdr, dup_l):
	"""
	Keep the first entry of every duplicate group and drop the others.

	pdr: table with an ID column.
	dup_l: list of groups; each group is a list of row POSITIONS (as returned
	       by get_duplist on a column's tolist()), not index labels.
	The positions are translated to ID values first, and rows are then removed
	by ID, so the result does not depend on the index labels of pdr.
	Returns pdr itself when nothing has to be removed, otherwise a filtered copy.
	"""
	# Materialise the ID column once instead of once per looked-up position.
	id_l = pdr.ID.tolist()
	pdr_ID_x = [[ id_l[x] for x in d] for d in dup_l]
	print('pdr_ID_x ->', pdr_ID_x)

	# The first ID of each group survives; all later ones are dropped.
	drop_ids = [x for d in pdr_ID_x for x in d[1:]]
	if not drop_ids:
		return pdr

	# One vectorised filter instead of one full-table filter per dropped ID.
	return pdr[ ~pdr.ID.isin( drop_ids)]

def pd_remove_faillist_ID( pdr, fail_l):
	"""
	Drop the rows found at the given positions.

	pdr: table with an ID column.
	fail_l: list of row POSITIONS (positions in a column's tolist()), not
	        index labels. List positions and index labels need not agree, so
	        the positions are translated to ID values first and the rows are
	        then removed by ID.
	Returns pdr itself when fail_l is empty, otherwise a filtered copy.
	"""
	# Materialise the ID column once instead of once per looked-up position.
	id_l = pdr.ID.tolist()
	pdr_ID_x = [ id_l[x] for x in fail_l]
	print("pdr_ID_x -> ", pdr_ID_x)

	if not pdr_ID_x:
		return pdr

	# One vectorised filter instead of one full-table filter per dropped ID.
	return pdr[ ~pdr.ID.isin( pdr_ID_x)]

def pd_check_mol2smiles( pd_smiles):
	"""
	Canonicalise a SMILES Series in place and report the failing positions.

	pd_smiles: pandas Series of SMILES codes. Its values are overwritten with
	           the list produced by check_mol2smiles(): rdkit-written SMILES
	           for convertible entries and '' for the others.
	Returns the list of positions that failed.

	NOTE(review): whether the write reaches a DataFrame the Series was taken
	from depends on pandas' copy-on-write setting - confirm for the callers.
	"""
	smiles_l = pd_smiles.tolist()
	fail_l = check_mol2smiles( smiles_l)

	# tolist() returns an independent list, so the refined values have to be
	# written back explicitly; rebinding the local name would not touch the Series.
	pd_smiles[:] = smiles_l

	return fail_l

def pd_check_mol2smi( pdr, smiles_id = 'SMILES'):
	"""
	Return the positions of the SMILES codes in pdr[smiles_id] that
	check_mol2smiles() reports as failing. The check runs on a list copy of
	the column, so pdr is not modified.
	"""
	return check_mol2smiles( pdr[smiles_id].tolist())

def pd_remove_dup_smiles( pdr, smiles_id = 'SMILES'):
	"""
	Drop the rows whose SMILES code repeats an earlier one.

	Duplicate groups are located with get_duplist() (which prints them), the
	groups are printed once more here, and pd_remove_duplist_ID() removes all
	but the first member of each group.
	Returns the filtered table.
	"""
	dup_positions = get_duplist( pdr[ smiles_id].tolist())
	print(dup_positions)

	return pd_remove_duplist_ID( pdr, dup_positions)

def pd_get_fp_strings( pdr, radius = 4, nBits = 1024, smiles_id = 'SMILES'):
	"""
	Return one Morgan-fingerprint bit string per SMILES code in pdr[smiles_id].

	radius, nBits: forwarded to AllChem.GetMorganFingerprintAsBitVect.
	"""
	# Parse every code first, then fingerprint the molecules one by one.
	mol_l = [Chem.MolFromSmiles( code) for code in pdr[ smiles_id].tolist()]

	fp_s_l = []
	for mol in mol_l:
		bit_vect = AllChem.GetMorganFingerprintAsBitVect( mol, radius = radius, nBits = nBits)
		fp_s_l.append( bit_vect.ToBitString())

	return fp_s_l

def xM( s_l, radius = 4, nBits = 1024):
	"""
	Convert a list of SMILES codes to a Morgan-fingerprint matrix.

	s_l: iterable of SMILES codes.
	radius, nBits: forwarded to AllChem.GetMorganFingerprintAsBitVect.
	Returns a numpy matrix built from the list of fingerprints
	(one fingerprint per code).
	"""
	m_l = [Chem.MolFromSmiles( s) for s in s_l]
	fp_l = [AllChem.GetMorganFingerprintAsBitVect(m, radius = radius, nBits = nBits) for m in m_l]

	# np.asmatrix is the function np.mat used to alias; the np.mat name itself
	# no longer exists in NumPy 2.x.
	return np.asmatrix( fp_l)

def pd_get_xM( pdr, radius = 4, nBits = 1024, smiles_id = 'SMILES'):
	"""
	Extract smiles codes and then convert them to fingerprint matrix.

	pdr: table holding the SMILES codes in column smiles_id.
	radius, nBits: forwarded to AllChem.GetMorganFingerprintAsBitVect.
	Returns a numpy matrix built from the list of fingerprints
	(one fingerprint per code).
	"""
	s_l = pdr[ smiles_id].tolist()
	m_l = [Chem.MolFromSmiles( s) for s in s_l]
	fp_l = [AllChem.GetMorganFingerprintAsBitVect(m, radius = radius, nBits = nBits) for m in m_l]

	# np.asmatrix is the function np.mat used to alias; the np.mat name itself
	# no longer exists in NumPy 2.x.
	return np.asmatrix( fp_l)

def pd_get_xM_N( pdr, radius = 4, nBits = 1024, smiles_id = 'SMILES', N = None):
	"""
	Extract smiles codes and then convert them to fingerprint matrix.
	Only the limited number of molecules are retrieved in order to reduce computational complexity.

	pdr: table holding the SMILES codes in column smiles_id.
	radius, nBits: forwarded to AllChem.GetMorganFingerprintAsBitVect.
	N: number of leading codes to use; a falsy value (None or 0) means all.
	Returns a numpy matrix built from the list of fingerprints.
	"""
	# A slice bound of None selects the whole list, so a single code path
	# covers both the limited and the unlimited case.
	limit = N if N else None
	s_l = pdr[ smiles_id].tolist()[:limit]

	m_l = [Chem.MolFromSmiles( s) for s in s_l]
	fp_l = [AllChem.GetMorganFingerprintAsBitVect(m, radius = radius, nBits = nBits) for m in m_l]

	# np.asmatrix is the function np.mat used to alias; the np.mat name itself
	# no longer exists in NumPy 2.x.
	return np.asmatrix( fp_l)

# Alias: pd_get_fpM is bound to the same function object as pd_get_xM.
pd_get_fpM = pd_get_xM

def pd_get_xM_MACCSkeys( pdr, smiles_id = 'SMILES'):
	"""
	Pass the SMILES codes of pdr[smiles_id], as a list, to
	jchem.get_xM_MACCSkeys and return whatever it returns.
	"""
	return jchem.get_xM_MACCSkeys( pdr[ smiles_id].tolist())

def pd_get_xM_molw( pdr, smiles_id = 'SMILES'):
	"""
	Pass the SMILES codes of pdr[smiles_id], as a list, to
	jchem.get_xM_molw and return whatever it returns.
	"""
	return jchem.get_xM_molw( pdr[ smiles_id].tolist())

def pd_get_xM_lasa( pdr, smiles_id = 'SMILES'):
	"""
	Pass the SMILES codes of pdr[smiles_id], as a list, to
	jchem.get_xM_lasa and return whatever it returns.
	"""
	return jchem.get_xM_lasa( pdr[ smiles_id].tolist())

def pd_get_fpM_fromStr( pdr, fp_id = 'Fingerprint'):
	"""
	Extract fingerprint strings and then convert them to fingerprint matrix.

	pdr: table whose fp_id column holds one string of digit characters per row
	     (e.g. '0101...'); every character becomes one integer matrix entry.
	Returns a numpy matrix with one row per string.
	"""
	s_l = pdr[ fp_id].tolist()
	fp_i_l2 = [[int( ch) for ch in bits] for bits in s_l]

	# np.asmatrix is the function np.mat used to alias; the np.mat name itself
	# no longer exists in NumPy 2.x.
	return np.asmatrix( fp_i_l2)

def pd_get_yV( pdr, y_id):
	"""
	Return column y_id of pdr as a numpy column matrix (one row per entry).
	"""
	# np.asmatrix is the function np.mat used to alias; the np.mat name itself
	# no longer exists in NumPy 2.x. A 1-D column becomes a row matrix, hence .T.
	return np.asmatrix( pdr[ y_id]).T

def pd_get_dup_smiles_and_property( pdr, smiles_id = 'SMILES', property_id = 'Solubility_log_mol_l'):
	"""
	Print the property values of molecules that share the same SMILES code.

	The SMILES column is first rewritten by pd_refine_smiles() (which modifies
	pdr in place). Three reports follow:
	  1. every duplicate group with the property of each member, flagged when
	     a member differs from the first member of its group by more than
	     1% ("Medium") or 10% ("Large") of the first member's absolute value;
	  2. the members exceeding the 1% threshold;
	  3. the members exceeding the 10% threshold.
	The numbers printed in front of ':' are row positions, not index labels.
	Nothing is returned.
	"""
	pdr1 = pd_refine_smiles( pdr, smiles_id = smiles_id)

	# get_duplist() reports list positions. Read both columns positionally so
	# the lookups stay correct when the index is not 0..n-1 (e.g. after rows
	# were filtered out); label-based lookups would hit the wrong rows or fail.
	smiles_v = pdr1[ smiles_id].values
	prop_v = pdr1[ property_id].values

	lst = get_duplist( smiles_v.tolist(), disp = False)

	def print_over_threshold( ratio):
		# List every group member whose property is off by more than
		# ratio * |property of the first member of its group|.
		for ll in lst:
			ref = prop_v[ ll[0]]
			for l0 in ll:
				delta = abs(ref - prop_v[l0])
				if delta > ratio * abs(ref):
					print(smiles_v[ ll[0]], '-->', end=' ')
					print(l0, ":", prop_v[l0], end=' ')
					print("Difference ({})".format( delta))

	print('SMILES --> Property-1(SMILES), Property-2(SMILES), ...')
	for ll in lst:
		ref = prop_v[ ll[0]]
		print(smiles_v[ ll[0]], '-->', end=' ')
		for l0 in ll:
			print(l0, ":", prop_v[l0], end=' ')
			delta = abs(ref - prop_v[l0])
			if delta > 0.1 * abs(ref):
				print("Large difference ({})".format( delta), end=' ')
			elif delta > 0.01 * abs(ref):
				print("Medium difference ({})".format( delta), end=' ')
		print()

	print('\n=========================================')
	print('Medium difference list: > 0.01 times')
	print_over_threshold( 0.01)

	print('\n=========================================')
	print('large difference list: > 0.1 times')
	print_over_threshold( 0.1)


"""
Class modules are described below,
while function modules are described above.
"""
class _PD_mlr_r0():
	def __init__(self, pdr, y_id = 'Solubility log(mol/L)', smiles_id = 'SMILES', preprocessing = False):
		
		if preprocessing:
			self.xM = jchem.calc_corr( pdr[ smiles_id].tolist())
		else: 
			self.xM = pd_get_fpM( pdr, smiles_id = smiles_id)

		self.yV = pd_get_yV( pdr, y_id = y_id) 

	def set_SVD(self):
		U,d,VT = np.linalg.svd( self.xM)
		self.xM = self.xM * VT.T

	def _val_vseq_mode_rand_r0( self, mode = {'type': 'ridge', 'alpha': 0.5}, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		"""
		ly = len( self.yV)
		vseq = jutil.choose( ly, int(ly / rate));

		if mode['type'] == 'ridge':
			r_sqr, RMSE = jutil.mlr_val_vseq_ridge( self.xM, self.yV, vseq, alpha = mode['alpha'], disp = disp, graph = graph)

		return r_sqr, RMSE

	def val_vseq_mode_rand( self, mode = {'type': 'ridge', 'alpha': 0.5}, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		"""
		ly = len( self.yV)
		vseq = jutil.choose( ly, int(ly / rate));

		r_sqr, RMSE = self.val_vseq_mode( self.xM, self.yV, vseq, mode = mode, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r0( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])
			clf.fit( RMt, yEt)

		if disp: print('Training result')
		jutil.mlr_show( clf, RMt, yEt, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.mlr_show( clf, RMv, yEv, disp = disp, graph = graph)

		#if r_sqr < 0:
		#   print 'v_seq:', v_seq, '--> r_sqr = ', r_sqr

		return r_sqr, RMSE


	def _val_vseq_mode_gpnorm( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

		elif mode['type'] == 'maml_gp':
			RMt_a, yEt_a = np.array( RMt), np.array( yEt) / mode['norm']
			RMv_a, yEv_a = np.array( RMv), np.array( yEv) / mode['norm']
			jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

			# Training mode
			jgp_en.optimize_noise_and_amp()
			yEt_predict = np.mat( jgp_en.predicted_targets)
			print(yEt_predict.shape)

			# Validation mode
			jgp_en.run_gp()
			yEv_predict = np.mat( jgp_en.predicted_targets)
			print(yEv_predict.shape)

		#if disp: print 'Training result'
		#jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv / mode['norm'], yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r0( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

		elif mode['type'] == 'maml_gp':
			RMt_a, yEt_a = np.array( RMt), np.array( yEt)
			RMv_a, yEv_a = np.array( RMv), np.array( yEv)
			jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

			# Training mode
			jgp_en.optimize_noise_and_amp()
			yEt_predict = np.mat( jgp_en.predicted_targets)
			print(yEt_predict.shape)

			# Validation mode
			jgp_en.run_gp()
			yEv_predict = np.mat( jgp_en.predicted_targets)
			print(yEv_predict.shape)

		if mode['type'] != 'maml_gp':
			if disp: print('Training result')
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def val_vseq_mode( self, RM, yE, v_seq, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""

		if 'tool' not in list(mode.keys()):
			if mode['type'] in ('maml_gp'):
				mode['tool'] = 'AAG'
			else:
				mode['tool'] = 'sklearn'

		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		if mode['tool'] == 'sklearn': 
			if mode['type'] == 'ridge':
				print('Ridge: alpha =', mode['alpha'])
				clf = linear_model.Ridge( alpha = mode['alpha'])
			elif mode['type'] == 'Lasso':
				print('Lasso: alpha =', mode['alpha'])
				clf = linear_model.Lasso( alpha = mode['alpha'])
			elif mode['type'] == 'ElasticNet':
				print('ElasticNet: alpha = {0}, l1_ratio = {1}'.format( mode['alpha'], mode['l1_ratio']))
				clf = linear_model.ElasticNet( alpha = mode['alpha'], l1_ratio = mode['l1_ratio'], normalize = True)
			elif mode['type'] == 'LassoLars':
				print('LassoLars: alpha =', mode['alpha'])
				clf = linear_model.LassoLars( alpha = mode['alpha'])
			else:
				raise TypeError("The given mode is not supported yet or spells are different.")

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

			if disp: print('Training result')
			#print yEt_predict[:10] #For debugging          
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		elif mode['tool'] == 'AAG':
			if mode['type'] == 'maml_gp':
				RMt_a, yEt_a = np.array( RMt), np.array( yEt)
				RMv_a, yEv_a = np.array( RMv), np.array( yEv)
				jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

				# Training mode
				jgp_en.optimize_noise_and_amp()
				yEt_predict = np.mat( jgp_en.predicted_targets)
				print(yEt_predict.shape)

				# Validation mode
				jgp_en.run_gp()
				yEv_predict = np.mat( jgp_en.predicted_targets)
				print(yEv_predict.shape)
		else:
			raise TypeError("{} is not support for mode-tool yet.".format( mode['tool']))

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r1( self, RM, yE, v_seq, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""

		if 'tool' not in list(mode.keys()):
			if mode['type'] in ('maml_gp'):
				mode['tool'] = 'AAG'
			else:
				mode['tool'] = 'sklearn'

		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		if mode['tool'] == 'sklearn': 
			if mode['type'] == 'ridge':
				print('Ridge: alpha =', mode['alpha'])
				clf = linear_model.Ridge( alpha = mode['alpha'])
			elif mode['type'] == 'Lasso':
				print('Lasso: alpha =', mode['alpha'])
				clf = linear_model.Lasso( alpha = mode['alpha'])
			elif mode['type'] == 'ElasticNet':
				print('ElasticNet: alpha = {0}, l1_ratio = {1}'.format( mode['alpha'], mode['l1_ratio']))
				clf = linear_model.ElasticNet( alpha = mode['alpha'], l1_ratio = mode['l1_ratio'])
			elif mode['type'] == 'LassoLars':
				print('LassoLars: alpha =', mode['alpha'])
				clf = linear_model.LassoLars( alpha = mode['alpha'])
			else:
				raise TypeError("The given mode is not supported yet or spells are different.")

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

			if disp: print('Training result')
			#print yEt_predict[:10] #For debugging          
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		elif mode['tool'] == 'AAG':
			if mode['type'] == 'maml_gp':
				RMt_a, yEt_a = np.array( RMt), np.array( yEt)
				RMv_a, yEv_a = np.array( RMv), np.array( yEv)
				jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

				# Training mode
				jgp_en.optimize_noise_and_amp()
				yEt_predict = np.mat( jgp_en.predicted_targets)
				print(yEt_predict.shape)

				# Validation mode
				jgp_en.run_gp()
				yEv_predict = np.mat( jgp_en.predicted_targets)
				print(yEv_predict.shape)
		else:
			raise TypeError("{} is not support for mode-tool yet.".format( mode['tool']))

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE


	def val_vseq_ridge_rand_profile( self, alpha = .5, rate = 2, iterN = 10, disp = False, graph = False, hist = True):
		jutil.mlr_val_vseq_ridge_rand_profile( self.xM, self.yV, alpha = alpha, rate = rate, iterN = iterN, 
			disp = disp, graph = graph, hist = hist)

	def val_vseq_mode_rand_profile( self, mode, rate = 2, iterN = 10, disp = True, graph = False, hist = True):

		RM = self.xM
		yE = self.yV

		r2_rms_list = []
		for ii in range( iterN):
			vseq = jutil.choose( len( yE), int(len( yE) / rate));
			r_sqr, RMSE = self.val_vseq_mode( RM, yE, vseq, mode = mode, disp = disp, graph = graph)
			r2_rms_list.append( (r_sqr, RMSE))

		r2_list, rms_list = list(zip( *r2_rms_list))

		#Showing r2 as histogram
		pd_r2 = pd.DataFrame( {'r_sqr': r2_list})
		pd_r2.plot( kind = 'hist', alpha = 0.5)

		#Showing rms as histogram
		pd_rms = pd.DataFrame( {'rms': rms_list})
		pd_rms.plot( kind = 'hist', alpha = 0.5)

		print("average r2 and sd:", list(map( np.mean, [r2_list, rms_list])))

		return r2_list, rms_list

class _PD_mlr_r1(): # 2015-6-3
	def __init__(self, pdr, y_id = 'Solubility log(mol/L)', smiles_id = 'SMILES', 
			preprocessing = False, forwardpreprocessing = True):
		"""
		y normalization is not important for prediction. 
		X normalization seems to be useful but not confirmed yet.
		"""

		if preprocessing:
			self.A = jchem.calc_corr( pdr[ smiles_id].tolist())
			self.xM = self.A
		else: 
			self.xM_org = pd_get_fpM( pdr, smiles_id = smiles_id)
			self.xM = self.xM_org

		self.preprocessing = preprocessing
		self.forwardpreprocessing = forwardpreprocessing

		self.yV = pd_get_yV( pdr, y_id = y_id)
		#self.mean_yV = np.mean( yV)
		#self.yV = yV - self.mean_yV

	def set_SVD(self):
		U,d,VT = np.linalg.svd( self.xM)
		self.xM = self.xM * VT.T

	def reset_SVD(self):
		self.xM = self.xM_org

	def val_vseq_mode_seq( self, mode = {'type': 'ridge', 'alpha': 0.5}, st_val = 0, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		"""
		ly = len( self.yV)
		vseq = list(range( st_val, ly, rate))

		r_sqr, RMSE = self.val_vseq_mode( self.xM, self.yV, vseq, mode = mode, disp = disp, graph = graph)

		return r_sqr, RMSE


	def _val_vseq_mode_rand_r0( self, mode = {'type': 'ridge', 'alpha': 0.5}, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		"""
		ly = len( self.yV)
		vseq = jutil.choose( ly, int(ly / rate));

		if mode['type'] == 'ridge':
			r_sqr, RMSE = jutil.mlr_val_vseq_ridge( self.xM, self.yV, vseq, alpha = mode['alpha'], disp = disp, graph = graph)

		return r_sqr, RMSE

	def val_vseq_mode_rand( self, mode = {'type': 'ridge', 'alpha': 0.5}, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		"""
		ly = len( self.yV)
		vseq = jutil.choose( ly, int(ly / rate));

		r_sqr, RMSE = self.val_vseq_mode( self.xM, self.yV, vseq, mode = mode, disp = disp, graph = graph)

		return r_sqr, RMSE


	def _val_vseq_mode_r0( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])
			clf.fit( RMt, yEt)

		if disp: print('Training result')
		jutil.mlr_show( clf, RMt, yEt, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.mlr_show( clf, RMv, yEv, disp = disp, graph = graph)

		#if r_sqr < 0:
		#   print 'v_seq:', v_seq, '--> r_sqr = ', r_sqr

		return r_sqr, RMSE


	def _val_vseq_mode_gpnorm( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

		elif mode['type'] == 'maml_gp':
			RMt_a, yEt_a = np.array( RMt), np.array( yEt) / mode['norm']
			RMv_a, yEv_a = np.array( RMv), np.array( yEv) / mode['norm']
			jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

			# Training mode
			jgp_en.optimize_noise_and_amp()
			yEt_predict = np.mat( jgp_en.predicted_targets)
			print(yEt_predict.shape)

			# Validation mode
			jgp_en.run_gp()
			yEv_predict = np.mat( jgp_en.predicted_targets)
			print(yEv_predict.shape)

		#if disp: print 'Training result'
		#jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv / mode['norm'], yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r0( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

		elif mode['type'] == 'maml_gp':
			RMt_a, yEt_a = np.array( RMt), np.array( yEt)
			RMv_a, yEv_a = np.array( RMv), np.array( yEv)
			jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

			# Training mode
			jgp_en.optimize_noise_and_amp()
			yEt_predict = np.mat( jgp_en.predicted_targets)
			print(yEt_predict.shape)

			# Validation mode
			jgp_en.run_gp()
			yEv_predict = np.mat( jgp_en.predicted_targets)
			print(yEv_predict.shape)

		if mode['type'] != 'maml_gp':
			if disp: print('Training result')
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def val_vseq_mode( self, RM, yE, v_seq, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""

		if 'tool' not in list(mode.keys()):
			if mode['type'] in ('maml_gp'):
				mode['tool'] = 'AAG'
			else:
				mode['tool'] = 'sklearn'

		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		if self.preprocessing and not self.forwardpreprocessing:
			RMt, yEt = RM[ t_seq, :-len(v_seq)], yE[ t_seq, 0]
			RMv, yEv = RM[ v_seq, :-len(v_seq)], yE[ v_seq, 0]          
		else:
			#This is general case
			RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
			RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		if mode['tool'] == 'sklearn': 
			if mode['type'].lower() == 'ridge':
				print('Ridge: alpha =', mode['alpha'])
				clf = linear_model.Ridge( alpha = mode['alpha'])
			elif mode['type'].lower() == 'Lasso'.lower():
				print('Lasso: alpha =', mode['alpha'])
				clf = linear_model.Lasso( alpha = mode['alpha'])
			elif mode['type'].lower() == 'ElasticNet'.lower():
				print('ElasticNet: alpha = {0}, l1_ratio = {1}'.format( mode['alpha'], mode['l1_ratio']))
				clf = linear_model.ElasticNet( alpha = mode['alpha'], l1_ratio = mode['l1_ratio'], normalize = True)
			elif mode['type'].lower() == 'LassoLars'.lower():
				print('LassoLars: alpha =', mode['alpha'])
				clf = linear_model.LassoLars( alpha = mode['alpha'])
			else:
				raise TypeError("The given mode is not supported yet or spells are different.")

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

			if disp: print('Training result')
			#print yEt_predict[:10] #For debugging          
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		elif mode['tool'] == 'AAG':
			if mode['type'] == 'maml_gp':
				RMt_a, yEt_a = np.array( RMt), np.array( yEt)
				RMv_a, yEv_a = np.array( RMv), np.array( yEv)
				jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

				# Training mode
				jgp_en.optimize_noise_and_amp()
				yEt_predict = np.mat( jgp_en.predicted_targets)
				print(yEt_predict.shape)

				# Validation mode
				jgp_en.run_gp()
				yEv_predict = np.mat( jgp_en.predicted_targets)
				print(yEv_predict.shape)
		else:
			raise TypeError("{} is not support for mode-tool yet.".format( mode['tool']))

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r1( self, RM, yE, v_seq, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.
		"""

		if 'tool' not in list(mode.keys()):
			if mode['type'] in ('maml_gp'):
				mode['tool'] = 'AAG'
			else:
				mode['tool'] = 'sklearn'

		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		if mode['tool'] == 'sklearn': 
			if mode['type'] == 'ridge':
				print('Ridge: alpha =', mode['alpha'])
				clf = linear_model.Ridge( alpha = mode['alpha'])
			elif mode['type'] == 'Lasso':
				print('Lasso: alpha =', mode['alpha'])
				clf = linear_model.Lasso( alpha = mode['alpha'])
			elif mode['type'] == 'ElasticNet':
				print('ElasticNet: alpha = {0}, l1_ratio = {1}'.format( mode['alpha'], mode['l1_ratio']))
				clf = linear_model.ElasticNet( alpha = mode['alpha'], l1_ratio = mode['l1_ratio'])
			elif mode['type'] == 'LassoLars':
				print('LassoLars: alpha =', mode['alpha'])
				clf = linear_model.LassoLars( alpha = mode['alpha'])
			else:
				raise TypeError("The given mode is not supported yet or spells are different.")

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

			if disp: print('Training result')
			#print yEt_predict[:10] #For debugging          
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		elif mode['tool'] == 'AAG':
			if mode['type'] == 'maml_gp':
				RMt_a, yEt_a = np.array( RMt), np.array( yEt)
				RMv_a, yEv_a = np.array( RMv), np.array( yEv)
				jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

				# Training mode
				jgp_en.optimize_noise_and_amp()
				yEt_predict = np.mat( jgp_en.predicted_targets)
				print(yEt_predict.shape)

				# Validation mode
				jgp_en.run_gp()
				yEv_predict = np.mat( jgp_en.predicted_targets)
				print(yEv_predict.shape)
		else:
			raise TypeError("{} is not support for mode-tool yet.".format( mode['tool']))

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE


	def val_vseq_ridge_rand_profile( self, alpha = .5, rate = 2, iterN = 10, disp = False, graph = False, hist = True):
		jutil.mlr_val_vseq_ridge_rand_profile( self.xM, self.yV, alpha = alpha, rate = rate, iterN = iterN, 
			disp = disp, graph = graph, hist = hist)

	def val_vseq_mode_rand_profile( self, mode, rate = 2, iterN = 10, disp = True, graph = False, hist = True):

		RM = self.xM
		yE = self.yV

		r2_rms_list = []
		for ii in range( iterN):
			vseq = jutil.choose( len( yE), int(len( yE) / rate));
			r_sqr, RMSE = self.val_vseq_mode( RM, yE, vseq, mode = mode, disp = disp, graph = graph)
			r2_rms_list.append( (r_sqr, RMSE))

		r2_list, rms_list = list(zip( *r2_rms_list))

		#Showing r2 as histogram
		pd_r2 = pd.DataFrame( {'r_sqr': r2_list})
		pd_r2.plot( kind = 'hist', alpha = 0.5)

		#Showing rms as histogram
		pd_rms = pd.DataFrame( {'rms': rms_list})
		pd_rms.plot( kind = 'hist', alpha = 0.5)

		print("average r2 and sd:", list(map( np.mean, [r2_list, rms_list])))

		return r2_list, rms_list

	def predict( self, new_smiles, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}):
		"""
		predict for new smiles codes
		"""
		if mode['type'].lower() == 'ridge':
			clf = linear_model.Ridge( alpha = mode['alpha'])
		else:
			raise TypeError('The requested mode is not supported yet.')

		#Find an weight vector
		clf.fit( self.xM, self.yV)

		#Predict for new molecules
		new_xM = jchem.gfpM( new_smiles)
		new_yV_pred = clf.predict( new_xM)

		return new_yV_pred

class PD_mlr(): # 2015-6-3
	"""
	Linear regression and its validation performed directly from a DataFrame.

	self.xM is the feature matrix and self.yV is the target vector, which are
	generated in __init__() from the smiles_id and y_id columns of pdr.
	"""
	def __init__(self, pdr, y_id = 'Solubility log(mol/L)', smiles_id = 'SMILES', 
			preprocessing = False, forwardpreprocessing = True):
		"""
		y normalization is not important for prediction. 
		X normalization seems to be useful but not confirmed yet.

		pdr: DataFrame including the smiles_id and y_id columns
		preprocessing: if True, the result of jchem.calc_corr() is used as xM,
			otherwise the result of pd_get_fpM() is used and also kept as xM_org.
		forwardpreprocessing: referred by val_vseq_mode() only if preprocessing is True.
		"""

		if preprocessing:
			self.A = jchem.calc_corr( pdr[ smiles_id].tolist())
			self.xM = self.A
		else: 
			self.xM_org = pd_get_fpM( pdr, smiles_id = smiles_id)
			self.xM = self.xM_org

		self.preprocessing = preprocessing
		self.forwardpreprocessing = forwardpreprocessing

		self.yV = pd_get_yV( pdr, y_id = y_id)

	@staticmethod
	def _mode_with_tool( mode):
		"""
		Return a copy of mode where the 'tool' key is always defined.
		The given dictionary is not modified since it can be a shared default argument.
		"""
		mode = dict( mode)
		if 'tool' not in mode:
			# Equality is tested: 'x in (\'maml_gp\')' was a substring test on a string.
			mode['tool'] = 'AAG' if mode['type'] == 'maml_gp' else 'sklearn'
		return mode

	@staticmethod
	def _predict_maml_gp( RMt, yEt, RMv, yEv):
		"""
		Run the maml Gaussian process and return (yEt_predict, yEv_predict) as matrices.
		"""
		RMt_a, yEt_a = np.array( RMt), np.array( yEt)
		RMv_a, yEv_a = np.array( RMv), np.array( yEv)
		jgp_en = gp.GaussianProcess( RMt_a, yEt_a, RMv_a, yEv_a)

		# Training mode
		jgp_en.optimize_noise_and_amp()
		yEt_predict = np.asmatrix( jgp_en.predicted_targets)
		print(yEt_predict.shape)

		# Validation mode
		jgp_en.run_gp()
		yEv_predict = np.asmatrix( jgp_en.predicted_targets)
		print(yEv_predict.shape)

		return yEt_predict, yEv_predict

	def set_SVD(self):
		"""
		self.xM is replaced by self.xM * V where V is obtained by the SVD of self.xM.
		"""
		# NOTE(review): '*' is a matrix product only if self.xM is np.matrix - confirm.
		_, _, VT = np.linalg.svd( self.xM)
		self.xM = self.xM * VT.T

	def reset_SVD(self):
		"""
		self.xM is restored from self.xM_org.
		self.xM_org is defined in __init__() only if preprocessing is False.
		"""
		self.xM = self.xM_org

	def val_vseq_mode_seq( self, mode = {'type': 'ridge', 'alpha': 0.5}, st_val = 0, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		The validation indices are st_val, st_val + rate, st_val + 2 * rate, and so on.
		Returns (r_sqr, RMSE) of val_vseq_mode().
		"""
		ly = len( self.yV)
		vseq = list(range( st_val, ly, rate))

		r_sqr, RMSE = self.val_vseq_mode( self.xM, self.yV, vseq, mode = mode, disp = disp, graph = graph)

		return r_sqr, RMSE


	def _val_vseq_mode_rand_r0( self, mode = {'type': 'ridge', 'alpha': 0.5}, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		Only the ridge type is supported in this revision, otherwise TypeError is raised.
		"""
		ly = len( self.yV)
		vseq = jutil.choose( ly, int(ly / rate))

		if mode['type'] == 'ridge':
			r_sqr, RMSE = jutil.mlr_val_vseq_ridge( self.xM, self.yV, vseq, alpha = mode['alpha'], disp = disp, graph = graph)
		else:
			raise TypeError("The given mode is not supported yet or spells are different.")

		return r_sqr, RMSE

	def val_vseq_mode_rand( self, mode = {'type': 'ridge', 'alpha': 0.5}, rate = 2, disp = True, graph = True):
		"""
		The regression performed directly from the pdr.
		We define mode dictionary to enter various types of optimization method.

		int( len( self.yV) / rate) validation indices are chosen by jutil.choose().
		Returns (r_sqr, RMSE) of val_vseq_mode().
		"""
		ly = len( self.yV)
		vseq = jutil.choose( ly, int(ly / rate))

		r_sqr, RMSE = self.val_vseq_mode( self.xM, self.yV, vseq, mode = mode, disp = disp, graph = graph)

		return r_sqr, RMSE


	def _val_vseq_mode_gpnorm( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.

		mode['norm'] is always read since the validation targets are divided by it.
		NOTE(review): the ridge predictions are not divided by mode['norm'] while 
		the targets are - confirm whether this is intended.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

		elif mode['type'] == 'maml_gp':
			yEt_predict, yEv_predict = self._predict_maml_gp( 
				RMt, np.array( yEt) / mode['norm'], RMv, np.array( yEv) / mode['norm'])
		else:
			raise TypeError("The given mode is not supported yet or spells are different.")

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv / mode['norm'], yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r0( self, RM, yE, v_seq, mode = {'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.

		The training result is shown except for the maml_gp type.
		TypeError is raised if the type is neither ridge nor maml_gp.
		"""
		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		#Regression or prediction can be performed by the predefined type such as Ridge.
		if mode['type'] == 'ridge':
			print('Ridge: alpha =', mode['alpha'])
			clf = linear_model.Ridge( alpha = mode['alpha'])

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

		elif mode['type'] == 'maml_gp':
			yEt_predict, yEv_predict = self._predict_maml_gp( RMt, yEt, RMv, yEv)
		else:
			raise TypeError("The given mode is not supported yet or spells are different.")

		if mode['type'] != 'maml_gp':
			if disp: print('Training result')
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def val_vseq_mode( self, RM, yE, v_seq, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.

		RM, yE: feature matrix and target vector, which are indexed as RM[ seq, :] and yE[ seq, 0]
		v_seq: indices for validation; the other indices are used for training
		mode: dictionary with 'type' and the parameters of that type such as 'alpha'. 
			If 'tool' is missing, it becomes 'AAG' for the maml_gp type and 'sklearn' otherwise.
			The given dictionary is not modified.
		disp, graph: forwarded to jutil.regress_show()

		Returns (r_sqr, RMSE) of the validation.
		Raises TypeError if the tool or the type is not supported.
		"""
		mode = self._mode_with_tool( mode)

		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		if self.preprocessing and not self.forwardpreprocessing:
			# NOTE(review): the last len(v_seq) columns are removed, 
			# not the columns indexed by v_seq - confirm that this is intended.
			RMt, yEt = RM[ t_seq, :-len(v_seq)], yE[ t_seq, 0]
			RMv, yEv = RM[ v_seq, :-len(v_seq)], yE[ v_seq, 0]          
		else:
			#This is general case
			RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
			RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		if mode['tool'] == 'sklearn': 
			# The type is compared without considering cases.
			m_type = mode['type'].lower()
			if m_type == 'ridge':
				print('Ridge: alpha =', mode['alpha'])
				clf = linear_model.Ridge( alpha = mode['alpha'])
			elif m_type == 'lasso':
				print('Lasso: alpha =', mode['alpha'])
				clf = linear_model.Lasso( alpha = mode['alpha'])
			elif m_type == 'elasticnet':
				print('ElasticNet: alpha = {0}, l1_ratio = {1}'.format( mode['alpha'], mode['l1_ratio']))
				clf = linear_model.ElasticNet( alpha = mode['alpha'], l1_ratio = mode['l1_ratio'], normalize = True)
			elif m_type == 'lassolars':
				print('LassoLars: alpha =', mode['alpha'])
				clf = linear_model.LassoLars( alpha = mode['alpha'])
			else:
				raise TypeError("The given mode is not supported yet or spells are different.")

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

			if disp: print('Training result')
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		elif mode['tool'] == 'AAG':
			if mode['type'] != 'maml_gp':
				# Otherwise no prediction is available for the validation below.
				raise TypeError("The given mode is not supported yet or spells are different.")
			yEt_predict, yEv_predict = self._predict_maml_gp( RMt, yEt, RMv, yEv)
		else:
			raise TypeError("{} is not support for mode-tool yet.".format( mode['tool']))

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE

	def _val_vseq_mode_r1( self, RM, yE, v_seq, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, disp = True, graph = True):
		"""
		Validation is peformed using vseq indexed values.

		This revision compares the type with considering cases and
		does not refer self.preprocessing. The given mode is not modified.
		Raises TypeError if the tool or the type is not supported.
		"""
		mode = self._mode_with_tool( mode)

		org_seq = list(range( len( yE)))
		t_seq = [x for x in org_seq if x not in v_seq]

		RMt, yEt = RM[ t_seq, :], yE[ t_seq, 0]
		RMv, yEv = RM[ v_seq, :], yE[ v_seq, 0]

		if mode['tool'] == 'sklearn': 
			if mode['type'] == 'ridge':
				print('Ridge: alpha =', mode['alpha'])
				clf = linear_model.Ridge( alpha = mode['alpha'])
			elif mode['type'] == 'Lasso':
				print('Lasso: alpha =', mode['alpha'])
				clf = linear_model.Lasso( alpha = mode['alpha'])
			elif mode['type'] == 'ElasticNet':
				print('ElasticNet: alpha = {0}, l1_ratio = {1}'.format( mode['alpha'], mode['l1_ratio']))
				clf = linear_model.ElasticNet( alpha = mode['alpha'], l1_ratio = mode['l1_ratio'])
			elif mode['type'] == 'LassoLars':
				print('LassoLars: alpha =', mode['alpha'])
				clf = linear_model.LassoLars( alpha = mode['alpha'])
			else:
				raise TypeError("The given mode is not supported yet or spells are different.")

			# Training mode
			clf.fit( RMt, yEt)
			yEt_predict = clf.predict( RMt)

			# Validation mode
			yEv_predict = clf.predict( RMv)

			if disp: print('Training result')
			jutil.regress_show( yEt, yEt_predict, disp = disp, graph = graph)

		elif mode['tool'] == 'AAG':
			if mode['type'] != 'maml_gp':
				raise TypeError("The given mode is not supported yet or spells are different.")
			yEt_predict, yEv_predict = self._predict_maml_gp( RMt, yEt, RMv, yEv)
		else:
			raise TypeError("{} is not support for mode-tool yet.".format( mode['tool']))

		if disp: print('Validation result')
		r_sqr, RMSE = jutil.regress_show( yEv, yEv_predict, disp = disp, graph = graph)

		return r_sqr, RMSE


	def val_vseq_ridge_rand_profile( self, alpha = .5, rate = 2, iterN = 10, disp = False, graph = False, hist = True):
		"""
		self.xM and self.yV are forwarded to jutil.mlr_val_vseq_ridge_rand_profile()
		together with all of the keyword arguments. Nothing is returned.
		"""
		jutil.mlr_val_vseq_ridge_rand_profile( self.xM, self.yV, alpha = alpha, rate = rate, iterN = iterN, 
			disp = disp, graph = graph, hist = hist)

	def val_vseq_mode_rand_profile( self, mode, rate = 2, iterN = 10, disp = True, graph = False, hist = True):
		"""
		Validation with random splitting is repeated iterN times.

		mode: dictionary forwarded to self.val_vseq_mode()
		rate: int( len( self.yV) / rate) samples are chosen for validation in each run
		iterN: the number of repetitions, which should be at least 1
		hist: if True, r2 and rms are shown as histograms

		Returns (r2_list, rms_list) which are tuples with one value per run.
		Raises ValueError if iterN is smaller than 1.
		"""
		if iterN < 1:
			# Nothing can be unzipped or averaged without any run.
			raise ValueError( "iterN should be at least 1 but {} is given.".format( iterN))

		RM = self.xM
		yE = self.yV

		r2_rms_list = []
		for ii in range( iterN):
			vseq = jutil.choose( len( yE), int(len( yE) / rate))
			r_sqr, RMSE = self.val_vseq_mode( RM, yE, vseq, mode = mode, disp = disp, graph = graph)
			r2_rms_list.append( (r_sqr, RMSE))

		r2_list, rms_list = list(zip( *r2_rms_list))

		if hist:
			#Showing r2 as histogram
			pd_r2 = pd.DataFrame( {'r_sqr': r2_list})
			pd_r2.plot( kind = 'hist', alpha = 0.5)

			#Showing rms as histogram
			pd_rms = pd.DataFrame( {'rms': rms_list})
			pd_rms.plot( kind = 'hist', alpha = 0.5)

		# The printed values are the mean of r2 and the mean of rms.
		print("average r2 and sd:", list(map( np.mean, [r2_list, rms_list])))

		return r2_list, rms_list

	def predict( self, new_smiles, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}):
		"""
		predict for new smiles codes

		A ridge regressor is fitted with self.xM and self.yV and then 
		applied to jchem.gfpM( new_smiles). 
		Raises TypeError if the type is not ridge (case-insensitive).
		"""
		if mode['type'].lower() == 'ridge':
			clf = linear_model.Ridge( alpha = mode['alpha'])
		else:
			raise TypeError('The requested mode is not supported yet.')

		#Find an weight vector
		clf.fit( self.xM, self.yV)

		#Predict for new molecules
		new_xM = jchem.gfpM( new_smiles)
		new_yV_pred = clf.predict( new_xM)

		return new_yV_pred

class PD_Regress(PD_mlr): # 2015-6-3
	"""
	Extended from PR_mlr for including high-level prediction functionality. 
	"""
	def __init__(self, pdr, y_id = 'Solubility log(mol/L)', smiles_id = 'SMILES', 
			preprocessing = False, forwardpreprocessing = True):
		"""
		y normalization is not important for prediction. 
		X normalization seems to be useful but not confirmed yet.
		  X normalization can be done per column basis for feature regression
		  and done per row basis for molecule basis 

		Different from PD_mlr, pdr and smiles_id are stored and 
		xM_org is always generated by pd_get_fpM().
		"""
		# The source data are kept since predict() uses them again.
		self.pdr, self.smiles_id = pdr, smiles_id
		self.preprocessing, self.forwardpreprocessing = preprocessing, forwardpreprocessing

		self.xM_org = pd_get_fpM( pdr, smiles_id = smiles_id)
		if not preprocessing:
			self.xM = self.xM_org
		else:
			self.A = jchem.calc_corr( pdr[ smiles_id].tolist())
			self.xM = self.A

		self.yV = pd_get_yV( pdr, y_id = y_id)

	def predict( self, new_smiles, mode = {'tool': 'sklearn', 'type': 'ridge', 'alpha': 0.5}, addDescriptor = True):
		"""
		Predict target values for new_smiles by ridge regression.

		new_smiles: list of smiles codes, which is appended to the smiles list of self.pdr
		mode: only the ridge type is accepted (case-insensitive), otherwise TypeError is raised
		addDescriptor: if True, the exact molecular weights calculated by rdkit are 
			added to the features by jchem.add_new_descriptor()
		"""
		if mode['type'].lower() != 'ridge':
			raise TypeError('The requested mode is not supported yet.')
		clf = linear_model.Ridge( alpha = mode['alpha'])

		if self.preprocessing:
			# jchem.calc_corr() is applied to the stored and the new molecules together.
			ss_merge = self.pdr[ self.smiles_id].tolist() + new_smiles
			A_merge_all = jchem.calc_corr( ss_merge)

			n_db = self.yV.shape[0]
			if self.forwardpreprocessing:
				Features_merge = A_merge_all[:n_db,:]
				Features_new = A_merge_all[n_db:,:]
			else:
				# Only the columns associated with the stored molecules are used.
				Features_merge = A_merge_all[:n_db,:n_db]
				Features_new = A_merge_all[n_db:,:n_db]
		else:
			Features_merge = self.xM
			Features_new = jchem.gfpM( new_smiles)
			if addDescriptor:
				ss_merge = self.pdr[ self.smiles_id].tolist() + new_smiles

		if addDescriptor:
			n_db = self.yV.shape[0]
			molw_l = [ Chem.rdMolDescriptors.CalcExactMolWt( Chem.MolFromSmiles(x)) for x in ss_merge]
			Features_merge = jchem.add_new_descriptor( Features_merge, molw_l[:n_db])
			Features_new = jchem.add_new_descriptor( Features_new, molw_l[n_db:])

		clf.fit( Features_merge, self.yV)
		return clf.predict( Features_new)


def collect_same_sm( sm_l, te_l):
	"""
	It collect property values which have the same SMILES code in the list
	sm_l: list of smiles codes
	te_l: list of property values associated with a smiles code 

	Only consecutive entries with the same smiles code are merged, 
	hence the same code appearing again later starts a new group.

	Returns (unique_sm_l, unique_te_l, min_te_l) where 
	unique_sm_l is the smiles code of each group,
	unique_te_l is the list of the property values of each group and 
	min_te_l is the minimum property value of each group.
	Empty lists are returned for empty input.
	"""
	unique_sm_l = []
	unique_te_l = []
	for sm, te in zip( sm_l, te_l):
		if unique_sm_l and sm == unique_sm_l[-1]:
			# The same code as the previous one: extend the current group.
			unique_te_l[-1].append( te)
		else:
			unique_sm_l.append( sm)
			unique_te_l.append( [te])

	min_te_l = [min( te_all) for te_all in unique_te_l]

	return unique_sm_l, unique_te_l, min_te_l

def pd_collect_same_sm( pdr, y_id = 'total_energy', smiles_id = 'SMILES'):
	"""
	Collect the y_id values of consecutive rows which have the same smiles code.

	pdr: DataFrame including the smiles_id and y_id columns
	y_id: column name of the property values
	smiles_id: column name of the smiles codes

	Returns a new DataFrame with the columns of 'ID' (starting from 1), 
	smiles_id, 'min_' + y_id and y_id, where the y_id column holds the list of 
	all collected values. The column names are the same as before for the default arguments.
	"""
	# The columns are selected by the arguments instead of fixed names.
	sm_l = pdr[ smiles_id].tolist()
	te_l = pdr[ y_id].tolist()

	unique_sm_l, unique_te_l, min_te_l = collect_same_sm( sm_l, te_l)

	pdw = pd.DataFrame()
	pdw['ID'] = list(range( 1, len(unique_sm_l)+1))
	pdw[ smiles_id] = unique_sm_l
	pdw['min_' + y_id] = min_te_l
	pdw[ y_id] = unique_te_l

	return pdw

def pdw_collect_same_sm( pdr, fname, y_id = 'total_energy', smiles_id = 'SMILES'):
	"""
	The result of pd_collect_same_sm() is written to fname as csv without the index.
	The shape of the result is printed before writing.
	"""
	merged = pd_collect_same_sm( pdr, smiles_id = smiles_id, y_id = y_id)
	print('Size of DataFrame:', merged.shape)
	merged.to_csv( fname, index = False)


class PD_MBR_Solubility():
	"""
	Solubility prediction using ridge regression on the similarities of 
	morgan and maccs fingerprints to the db molecules together with molecular weights.
	"""
	def __init__(self, fname_model = 'sheet/model.pkl', mode = 'offline', 
			fname_db = 'sheet/ws_all_smiles_496.csv', smiles_id = 'SMILES', y_id = 'Solubility_log_mol_l',
			graph = True, disp = True): 
		"""
		fname_model: file name of the dumped model
		mode: if 'offline', the model is loaded from fname_model. 
			Otherwise (e.g., 'online'), self.lm is not defined until modeling() is called.
		fname_db: csv file of the db (training) molecules
		smiles_id, y_id: column names of the smiles codes and the target values in the db
		graph: if True, the regression graph for confirmation is shown in prediction
		disp: if True, the progress messages are printed
		"""
		# or mode = 'online'

		# Now graph is turned on and off by a parameter of graph.
		self.graph = graph
		self.disp = disp

		self.fname_model = fname_model
		self.fname_db = fname_db

		if mode == 'offline':
			self.lm = externals.joblib.load( self.fname_model) 

		self.smiles_id = smiles_id
		self.y_id = y_id

		if self.disp: print('=== Read data ===')
		self.pdr = pd.read_csv( fname_db)
		self.s_l = self.pdr[ self.smiles_id].tolist()
		self.yV = pd_get_yV( self.pdr, y_id = self.y_id)


	def modeling(self, alpha = 0.1):
		"""
		Fit a ridge model with the db data, dump it to self.fname_model and 
		keep it as self.lm.

		alpha: regularization parameter of the ridge regression
		"""
		print('I. extract SMILES dn yV')
		plt.hist( self.yV)
		plt.show()

		print('II. generate xM - morgan, maccs, weight')
		xM = dict()
		xM['morgan'] = pd_get_xM( self.pdr, radius=6, nBits=4096)
		xM['maccs'] = pd_get_xM_MACCSkeys( self.pdr)
		xM['weight'] = pd_get_xM_molw( self.pdr)

		mode_l = list(xM.keys())
		print('[xM[mode].shape for mode in mode_l] =')
		print([xM[mode].shape for mode in mode_l])

		print('III. Convert to A1, A2 and AE = A1+A2+W')     
		A = dict()
		for mode in ['morgan', 'maccs']:
			print(mode)
			A[mode] = j3x.jpyx.calc_tm_sim_M( xM[mode])
			print(A[mode].shape)
	
		# The features are the two similarity matrices and the weights concatenated column-wise.
		AW_ensemble = np.concatenate( (A['morgan'], A['maccs'], xM['weight']), axis = 1)
		print('AW_ensemble.shape =', AW_ensemble.shape)

		print('IV. Fitting process')
		lm = linear_model.Ridge( alpha = alpha)
		lm.fit( AW_ensemble, self.yV)
		yV_pred = lm.predict( AW_ensemble)

		jutil.regress_show( self.yV, yV_pred)

		print('V. dump model to', self.fname_model)
		externals.joblib.dump(lm, self.fname_model)     

		self.lm = lm

	def predict(self, fname_data = 'sheet/diphenoquinone-only.csv'):
		"""
		Predict for the molecules in the SMILES column of fname_data.

		The result is saved with an additional 'Solubility' column to 
		fname_data[:-4] + '-sol-' + the base name of self.fname_db.
		All messages are printed regardless of self.disp.

		Returns the predicted values of the molecules in fname_data.
		"""

		print('I. Load prediction data')
		pdr_data = pd.read_csv( fname_data)

		# The db molecules come first and the new molecules follow.
		data_s_l = pdr_data.SMILES.tolist()
		db_data_s_l = list()
		db_data_s_l.extend( self.s_l)
		db_data_s_l.extend( data_s_l)
		print('len( db_data_s_l ) = ', len( db_data_s_l )) 

		print('III. generate xM - morgan, maccs, weight')
		xM_db_data = dict()
		xM_db_data['morgan'] = jchem.get_xM( db_data_s_l, radius=6, nBits=4096)
		xM_db_data['maccs'] = jchem.get_xM_MACCSkeys( db_data_s_l)
		xM_db_data['weight'] = jchem.get_xM_molw( db_data_s_l)

		A_db_data = dict()
		for mode in ['morgan', 'maccs']:
			print(mode)
			A_db_data[mode] = j3x.jpyx.calc_tm_sim_M( xM_db_data[mode])
			print(A_db_data[mode].shape)

		print('IV. Convert to A1, A2 and AE = A1+A2+W')              
		# Only the first len( self.s_l) columns, i.e., the similarities to the db molecules, are kept.
		AW_db_data_ensemble = np.concatenate( (A_db_data['morgan'][:,:len( self.s_l)], 
								A_db_data['maccs'][:,:len( self.s_l)], xM_db_data['weight']), axis = 1)
		print(AW_db_data_ensemble.shape)

		print('V. Prediction and Confirm using db data')
		yV_data_db_pred = self.lm.predict( AW_db_data_ensemble)
		if self.graph: 
			print('Training results for confirmation:')
			jutil.regress_show( self.yV, yV_data_db_pred[:len(self.s_l),0])

		print('VI. Save results')

		pdw_data = pdr_data.copy()
		pdw_data['Solubility'] = yV_data_db_pred[len(self.s_l):,0].tolist()
		fname_db_base = os.path.basename( self.fname_db)
		pdw_data.to_csv( fname_data[:-4] + '-sol-' + fname_db_base, index = False)

		return yV_data_db_pred[len(self.s_l):,0]

	def predict_fname(self, fname_data = 'sheet/diphenoquinone-only.csv'):
		"""
		The original function codes are separated into two parts which 
		are a data loading part and a prediction part. Moreover,
		the prediction part is used in common with single smiles. 
		Even if predict() and predict_fname() are equivalent, the original
		predict() is still remained and later it will be prohibited once
		the functionality of predict_fanme() is proved to be the same with 
		predict().

		The result DataFrame and its file name are kept as 
		self.PDR_SOL and self.FNAME_SOL, respectively.
		"""

		print('I. Load prediction data')
		pdr_data = pd.read_csv( fname_data)

		yV = self.predict_pdr( pdr_data)

		print('VI. Save results')
		pdw_data = pdr_data.copy()
		pdw_data['Solubility'] = yV.tolist()
		# The final result can be referred by self.PDR_SOL
		self.PDR_SOL = pdw_data

		# The result filename can be accessed using FNAME_SOL
		fname_db_base = os.path.basename( self.fname_db)
		self.FNAME_SOL = fname_data[:-4] + '-sol-' + fname_db_base
		pdw_data.to_csv( self.FNAME_SOL, index = False)

		return yV		

	def predict_pdr( self, pdr_data):
		"""
		All print command are changed to be disable if self.disp is False. 

		pdr_data: DataFrame including the SMILES column
		Returns the predicted values of the molecules in pdr_data.
		"""

		if self.disp: print('I-2. Extract SMILES strings')
		data_s_l = pdr_data.SMILES.tolist()
		db_data_s_l = list()
		db_data_s_l.extend( self.s_l)
		db_data_s_l.extend( data_s_l)
		if self.disp: print('len( db_data_s_l ) = ', len( db_data_s_l )) 

		if self.disp: print('III. generate xM - morgan, maccs, weight')
		xM_db_data = dict()
		xM_db_data['morgan'] = jchem.get_xM( db_data_s_l, radius=6, nBits=4096)
		xM_db_data['maccs'] = jchem.get_xM_MACCSkeys( db_data_s_l)
		xM_db_data['weight'] = jchem.get_xM_molw( db_data_s_l)

		A_db_data = dict()
		for mode in ['morgan', 'maccs']:
			A_db_data[mode] = j3x.jpyx.calc_tm_sim_M( xM_db_data[mode])

		if self.disp: print('IV. Convert to A1, A2 and AE = A1+A2+W')              
		AW_db_data_ensemble = np.concatenate( (A_db_data['morgan'][:,:len( self.s_l)], 
								A_db_data['maccs'][:,:len( self.s_l)], xM_db_data['weight']), axis = 1)
		if self.disp: print(AW_db_data_ensemble.shape)

		if self.disp: print('V. Prediction and Confirm using db data')
		yV_data_db_pred = self.lm.predict( AW_db_data_ensemble)

		if self.graph:
			if self.disp: print('Training results for confirmation:')
			jutil.regress_show( self.yV, yV_data_db_pred[:len(self.s_l),0])

		# The first len( self.s_l) rows belong to the db molecules.
		return yV_data_db_pred[len(self.s_l):,0]


	def predict_smiles(self, smiles_l):
		"""
		Now a single smiles string is used to predict solubility. 
		Hence, pdr_data is generated used this string. 

		smiles_l is used as the SMILES column of a new DataFrame, 
		which is forwarded to self.predict_pdr().
		"""
		pdr_data = pd.DataFrame( {'SMILES': smiles_l})

		return self.predict_pdr( pdr_data)


	def score( self, fname_data = 'sheet/diphenoquinone-only.csv', y_id = 'exp'):
		"""
		Show the regression result between the y_id column of fname_data and 
		the prediction by self.predict(), which also saves its result file.
		"""

		pdr_data = pd.read_csv( fname_data)
		yV = pd_get_yV( pdr_data, y_id = y_id)
		yV_pred = self.predict( fname_data)

		print('Testing results:')
		jutil.regress_show( yV, yV_pred)


	def score_divide( self, fname_data = 'sheet/diphenoquinone-only.csv', y_id = 'exp', Npart = 1000):
		"""
		fname_data is divided by pd_divide() and self.score() is applied to each part.
		The parts are merged by pd_concat() at the end.

		y_id: column name of the target values, which is forwarded to self.score()
		Npart: forwarded to pd_divide()
		"""

		Nfile = pd_divide( fname_data, Npart = Npart)

		# Prediction is performed
		for npart in range( Nfile):
			print(npart, 'is proceeding now...')
			fname_part = fname_data[:-4] + '_' + str(npart) + '.csv'
			# The given y_id is used instead of a fixed column name.
			self.score( fname_part, y_id = y_id)

		pd_concat( fname_data, self.fname_db, Nfile = Nfile)

	def predict_divide( self, fname_data = 'sheet/diphenoquinone-only.csv', y_id = 'exp', Npart = 1000):
		"""
		fname_data is divided by pd_divide() and self.predict() is applied to each part.
		The parts are merged by pd_concat() at the end.

		y_id is not referred since no target value is needed for prediction.
		"""

		Nfile = pd_divide( fname_data, Npart = Npart)

		# Prediction is performed
		for npart in range( Nfile):
			print(npart, 'is proceeding now...')
			fname_part = fname_data[:-4] + '_' + str(npart) + '.csv'
			self.predict( fname_part)

		pd_concat( fname_data, self.fname_db, Nfile = Nfile)

class PD_MBR_Solubility_Fast( PD_MBR_Solubility):
	"""
	PD_MBR_Solubility which keeps the fingerprints of the db molecules as self.xM_db
	so that only the new molecules are processed in prediction.
	"""
	def __init__(self, fname_model = 'sheet/model.pkl', mode = 'offline', 
			fname_db = 'sheet/ws_all_smiles_496.csv', smiles_id = 'SMILES', y_id = 'Solubility_log_mol_l',
			graph = True, disp = True): 
		"""
		The arguments are the same as PD_MBR_Solubility. 
		If mode is 'offline', self.xM_db is loaded, otherwise it is calculated and saved.
		"""
		PD_MBR_Solubility.__init__(self, fname_model = fname_model, mode = mode, 
			fname_db = fname_db, smiles_id = smiles_id, y_id = y_id, graph = graph, disp = disp)

		# The last four characters (e.g., '.pkl') are removed to make a prefix.
		prefix = fname_model[:-4]
		fname_xM_db = prefix + '_xM_db_dict.pkl'
		is_offline = mode == 'offline'

		if self.disp:
			if is_offline:
				print('Calculated xM_db[morgan] and xM_db[maccs] will be loaded.')
				print('Loading file name:', fname_xM_db)
			else:
				print('xM_db[morgan] and xM_db[maccs] will be calculated and saved.')
				print('Saving file name:', fname_xM_db)

		if is_offline:
			self.predict_pdr_pri_by_load( prefix)
		else:
			self.predict_pdr_pri_and_save( prefix)

	def predict_pdr_pri_and_save( self, fname_model_fast):
		"""
		self.xM_db is calculated and then saved by jfile.save_obj().
		"""
		self.predict_pdr_pri()
		fname_obj = fname_model_fast + '_xM_db_dict'
		jfile.save_obj( self.xM_db, fname_obj)

	def predict_pdr_pri_by_load( self, fname_model_fast):
		"""
		self.xM_db is loaded by jfile.load_obj() instead of being calculated.
		"""
		fname_obj = fname_model_fast + '_xM_db_dict'
		self.xM_db = jfile.load_obj( fname_obj)

	def predict_pdr_pri( self):
		"""
		The xM for training data should be stored for fast calculation.		
		Otherwise, it should be calculated every time. 
		Probably generating fingerprint takes no long time, while
		calculation of similarity takes more time. 
		Hence, for the first step I will reduce to calculate similarity of a new molecular group
		instead of calculating both all training molecules and target molecules. 

		The weight descriptor is not generated here.
		"""
		if self.disp: 
			print('I-1. Generate sets of fingerprints for db.')

		# Only the two fingerprints of the db molecules are kept.
		self.xM_db = {
			'morgan': jchem.get_xM( self.s_l, radius=6, nBits=4096),
			'maccs': jchem.get_xM_MACCSkeys( self.s_l)}

	def predict_pdr( self, pdr_data):
		"""
		Prediction relies on self.xM_db prepared in __init__().
		"""
		return self.predict_pdr_post( pdr_data)

	def predict_pdr_post( self, pdr_data):
		"""
		All print command are changed to be disable if self.disp is False. 

		The similarities are calculated only between the new molecules and 
		the db molecules using self.xM_db.

		pdr_data: DataFrame including the SMILES column
		Returns the result of self.lm.predict() for the new molecules.
		"""
		if self.disp: 
			print('I-2. Extract SMILES strings')
		new_s_l = pdr_data.SMILES.tolist()

		xM_new = {
			'morgan': jchem.get_xM( new_s_l, radius=6, nBits=4096),
			'maccs': jchem.get_xM_MACCSkeys( new_s_l),
			'weight': jchem.get_xM_molw( new_s_l)}

		# Similarity of each new molecule to the db molecules per fingerprint
		A_new = {fp: j3x.jpyx.calc_tm_sim_MM( xM_new[fp], self.xM_db[fp]) 
			for fp in ('morgan', 'maccs')}

		if self.disp: 
			print('IV. Convert to A1, A2 and AE = A1+A2+W')              
		feature_l = (A_new['morgan'], A_new['maccs'], xM_new['weight'])
		AW_new = np.concatenate( feature_l, axis = 1)
		if self.disp: 
			print(AW_new.shape)

		if self.disp: 
			print('V. Prediction and Confirm using db data')
		return self.lm.predict( AW_new)


class PD_MBR_Solubility_nomolw( PD_MBR_Solubility):
	"""
	PD_MBR_Solubility without the molecular weight descriptor: 
	modeling() and predict() use only the morgan and maccs similarities.
	"""

	def __init__(self, fname_model = 'sheet/model.pkl', mode = 'offline', 
			fname_db = 'sheet/ws_all_smiles_496.csv', smiles_id = 'SMILES', y_id = 'Solubility_log_mol_l'): 
		"""
		The arguments are the same as PD_MBR_Solubility except graph and disp, 
		for which the defaults of the super class are used. 
		"""
		# The super class also defines self.graph and self.disp, 
		# which are read by the inherited prediction methods.
		PD_MBR_Solubility.__init__(self, fname_model = fname_model, mode = mode, 
			fname_db = fname_db, smiles_id = smiles_id, y_id = y_id)

	def modeling(self, alpha = 0.1):
		"""
		Fit a ridge model with the db data, dump it to self.fname_model and 
		keep it as self.lm.

		alpha: regularization parameter of the ridge regression
		"""
		print('I. extract SMILES dn yV')
		plt.hist( self.yV)
		plt.show()

		print('II. generate xM - morgan, maccs')
		xM = dict()
		xM['morgan'] = pd_get_xM( self.pdr, radius=6, nBits=4096)
		xM['maccs'] = pd_get_xM_MACCSkeys( self.pdr)

		mode_l = list(xM.keys())
		print('[xM[mode].shape for mode in mode_l] =')
		print([xM[mode].shape for mode in mode_l])

		print('III. Convert to A1, A2 and AE = A1+A2')
		A = dict()
		for mode in ['morgan', 'maccs']:
			print(mode)
			A[mode] = j3x.jpyx.calc_tm_sim_M( xM[mode])
			print(A[mode].shape)
	
		# The features are the two similarity matrices concatenated column-wise.
		AW_ensemble = np.concatenate( (A['morgan'], A['maccs']), axis = 1)
		print('AW_ensemble.shape =', AW_ensemble.shape)

		print('IV. Fitting process')
		lm = linear_model.Ridge( alpha = alpha)
		lm.fit( AW_ensemble, self.yV)
		yV_pred = lm.predict( AW_ensemble)

		jutil.regress_show( self.yV, yV_pred)

		print('V. dump model to', self.fname_model)
		externals.joblib.dump(lm, self.fname_model)     

		self.lm = lm

	def predict(self, fname_data = 'sheet/diphenoquinone-only.csv'):
		"""
		Predict for the molecules in the SMILES column of fname_data.

		The result is saved with an additional 'Solubility' column to 
		fname_data[:-4] + '-sol-' + the base name of self.fname_db.
		The training result for confirmation is always shown.

		Returns the predicted values of the molecules in fname_data.
		"""

		print('I. Load prediction data')
		pdr_data = pd.read_csv( fname_data)

		# The db molecules come first and the new molecules follow.
		data_s_l = pdr_data.SMILES.tolist()
		db_data_s_l = list()
		db_data_s_l.extend( self.s_l)
		db_data_s_l.extend( data_s_l)
		print('len( db_data_s_l ) = ', len( db_data_s_l )) 

		print('III. generate xM - morgan, maccs')
		xM_db_data = dict()
		xM_db_data['morgan'] = jchem.get_xM( db_data_s_l, radius=6, nBits=4096)
		xM_db_data['maccs'] = jchem.get_xM_MACCSkeys( db_data_s_l)

		A_db_data = dict()
		for mode in ['morgan', 'maccs']:
			print(mode)
			A_db_data[mode] = j3x.jpyx.calc_tm_sim_M( xM_db_data[mode])
			print(A_db_data[mode].shape)

		print('IV. Convert to A1, A2 and AE = A1+A2+W')              
		# Only the first len( self.s_l) columns, i.e., the similarities to the db molecules, are kept.
		AW_db_data_ensemble = np.concatenate( (A_db_data['morgan'][:,:len( self.s_l)], 
			A_db_data['maccs'][:,:len( self.s_l)]), axis = 1)
		print(AW_db_data_ensemble.shape)

		print('V. Prediction and Confirm using db data')
		yV_data_db_pred = self.lm.predict( AW_db_data_ensemble)
		print('Training results for confirmation:')
		jutil.regress_show( self.yV, yV_data_db_pred[:len(self.s_l),0])

		print('VI. Save results')

		pdw_data = pdr_data.copy()
		pdw_data['Solubility'] = yV_data_db_pred[len(self.s_l):,0].tolist()
		fname_db_base = os.path.basename( self.fname_db)
		pdw_data.to_csv( fname_data[:-4] + '-sol-' + fname_db_base, index = False)

		return yV_data_db_pred[len(self.s_l):,0]

class PD_RedoxPotential( PD_MBR_Solubility):
	"""
	Ridge regression of a redox potential from two SMILES strings per molecule.

	Each database row provides two SMILES columns (smiles_id_l[0] and
	smiles_id_l[1]); get_2A_db_data describes them as the basic and the
	reduced form of a molecule. Morgan and MACCS similarity matrices of both
	forms are concatenated column-wise and used as regression features.

	NOTE(review): self.pdr, self.s_l, self.yV, self.smiles_id, self.fname_db
	and self.fname_model are read here but never assigned in this class;
	they are presumably set by PD_MBR_Solubility.__init__ - confirm.
	"""
	def __init__(self, fname_model = None, mode = 'offline',
			fname_db = 'sheet/cxcalc/Flavin533_2SMILES_P-RP(pH10).csv', 
			smiles_id_l = ['SMILES', 'R-SMILES'], y_id = 'RP at pH10'):
		"""
		fname_model: file of the dumped model; when None it is derived from
			fname_db by replacing its last four characters with '.pkl'.
		mode, fname_db, y_id: forwarded unchanged to PD_MBR_Solubility.__init__.
		smiles_id_l: two column names; the first one is forwarded to the
			parent class, the second one selects the additional SMILES column.
			The list is only indexed, never modified.
		"""
		# Identity test: None is a singleton and '==' can be overloaded by the argument.
		if fname_model is None:
			fname_model = fname_db[:-4] + ".pkl"

		PD_MBR_Solubility.__init__(self, fname_model, mode, fname_db, smiles_id_l[0], y_id)

		self.r_smiles_id = smiles_id_l[1]
		self.rs_l = self.pdr[ self.r_smiles_id].tolist()

	def modeling(self, alpha = 0.1):
		"""
		Fit a Ridge model on the training database and dump it to self.fname_model.

		alpha: regularization strength handed to linear_model.Ridge.

		The fitted estimator is also kept in self.lm. A histogram of self.yV
		and the regression result on the training data are displayed.
		"""
		print('I. extract SMILES and yV')
		plt.hist( self.yV)
		plt.show()

		print('II. generate xM - morgan, maccs')
		xM1_l = jchem.get_xM( self.s_l)
		xM1_r = jchem.get_xM( self.rs_l)
		xM2_l = jchem.get_xM_MACCSkeys( self.s_l)
		xM2_r = jchem.get_xM_MACCSkeys( self.rs_l)

		A1_l = j3x.jpyx.calc_tm_sim_M( xM1_l)
		A1_r = j3x.jpyx.calc_tm_sim_M( xM1_r)
		A2_l = j3x.jpyx.calc_tm_sim_M( xM2_l)
		A2_r = j3x.jpyx.calc_tm_sim_M( xM2_r)
		# Column order (morgan, r-morgan, maccs, r-maccs) must match the order used in predict().
		AW_ensemble = np.concatenate( [A1_l, A1_r, A2_l, A2_r], axis = 1)
		
		print("AW_ensemble.shape is", AW_ensemble.shape)

		print('III. Fitting process')
		lm = linear_model.Ridge( alpha = alpha)
		lm.fit( AW_ensemble, self.yV)
		yV_pred = lm.predict( AW_ensemble)

		jutil.regress_show3( self.yV, yV_pred)

		print('V. dump model to', self.fname_model)
		externals.joblib.dump(lm, self.fname_model)     

		self.lm = lm

	def get_2A_db_data(self, db_s_l, data_s_l):
		"""
		Since two As are needed for each basic and reduced forms of a molecule,
		we use this function to calculate the two As of Morgan and MACCS for each form. 

		db_s_l: SMILES strings of the database molecules.
		data_s_l: SMILES strings of the molecules to be predicted.

		Returns a dict with the keys 'morgan' and 'maccs'; each value is the
		result of j3x.jpyx.calc_tm_sim_M for the database SMILES followed by
		the data SMILES.
		"""
		db_data_s_l = list()
		db_data_s_l.extend( db_s_l)
		db_data_s_l.extend( data_s_l)
		print('len( db_data_s_l ) = ', len( db_data_s_l )) 

		xM_db_data = dict()
		xM_db_data['morgan'] = jchem.get_xM( db_data_s_l)
		xM_db_data['maccs'] = jchem.get_xM_MACCSkeys( db_data_s_l)

		A_db_data = dict()
		for mode in ['morgan', 'maccs']:
			A_db_data[mode] = j3x.jpyx.calc_tm_sim_M( xM_db_data[mode])

		return A_db_data

	def predict(self, fname_data = 'sheet/cxcalc/flavins-mix3R_PlogSpH.csv'):
		"""
		Predict the target value for every row of a csv file.

		fname_data must contain the two SMILES columns named by self.smiles_id
		and self.r_smiles_id. The predictions are stored in a 'P-RP' column and
		written to fname_data[:-4] + '-sol-' + basename(self.fname_db).

		Requires self.lm, which this class assigns in modeling().

		Returns the predictions for the rows of fname_data only.
		"""
		print('I. Load prediction data')
		pdr_data = pd.read_csv( fname_data)

		data_s_l = pdr_data[self.smiles_id].tolist()
		data_rs_l = pdr_data[self.r_smiles_id].tolist()

		A  = self.get_2A_db_data( self.s_l, data_s_l)
		rA = self.get_2A_db_data( self.rs_l, data_rs_l)

		print('IV. Convert to A1, A2 for morgan and rA1, rA2 for maccs, so A1+rA1 + A2+rA2')              
		# Keep only the similarities towards the database molecules, in the same
		# column order as in modeling().
		AW_db_data_ensemble = np.concatenate( (
			A['morgan'][:,:len( self.s_l)],	rA['morgan'][:,:len( self.rs_l)], 
			A['maccs'][:,:len( self.s_l)], rA['maccs'][:,:len( self.rs_l)]), axis = 1)
		print(AW_db_data_ensemble.shape)

		print('V. Prediction and Confirm using db data')
		yV_data_db_pred = self.lm.predict( AW_db_data_ensemble)
		print('Training results for confirmation:')
		jutil.regress_show3( self.yV, yV_data_db_pred[:len(self.s_l),0])

		print('VI. Save results')

		pdw_data = pdr_data.copy()
		# Rows after the database part are the new molecules.
		pdw_data['P-RP'] = yV_data_db_pred[len(self.s_l):,0].tolist()
		fname_db_base = os.path.basename( self.fname_db)
		pdw_data.to_csv( fname_data[:-4] + '-sol-' + fname_db_base, index = False)

		return yV_data_db_pred[len(self.s_l):,0]


def pd_get( fname_csv = 'sheet/ws496.csv'):
	"""
	Load a csv file and derive the model inputs from it.

	The feature matrix comes from pd_get_xM with its default SMILES column
	(presumably 'SMILES') and the target vector is read from the 'exp' column.

	Returns the tuple (DataFrame, xM, yV).
	"""
	frame = pd.read_csv( fname_csv)
	return frame, pd_get_xM( frame), pd_get_yV( frame, y_id = 'exp')


def pd_get_bclass( fname_csv = 'sheet/ws496.csv', th = -3):
	"""
	Load a csv file through pd_get and binarize its target values.

	A target is labelled 1 when it is greater than or equal to th, else 0.
	Plain numpy arrays are returned (rather than np.mat) for better
	compatibility with sklearn.

	Returns the tuple (DataFrame, xM as array, array of 0/1 labels).
	"""
	pdr, features, targets = pd_get( fname_csv)

	# Only the first column of the target matrix is thresholded.
	labels = [int( bool( y >= th)) for y in targets[:,0]]

	return pdr, np.array( features), np.array( labels)

def pd_divide( fname, Npart = 1000):
	"""
	Split one csv file into consecutive chunks of at most Npart rows.

	Chunk number k is written to fname[:-4] + '_k.csv', i.e. the last four
	characters of fname are treated as its extension.

	Returns the number of chunk files written.
	"""
	whole = pd.read_csv( fname)
	Nall = whole.shape[0]
	Nfile = int( np.ceil( float(Nall) / Npart))

	for npart in range( Nfile):
		first = npart * Npart
		# A slice end beyond the last row is clipped, so the final chunk needs no special case.
		chunk = whole.iloc[first:first + Npart]
		chunk.to_csv( '{}_{}.csv'.format( fname[:-4], npart), index = False)

	print(Nfile, 'is saved.')

	return Nfile

def pd_concat( fname_all, fname_db, Nfile):
	"""
	Merge per-chunk result files back into a single csv file.

	For k = 0 .. Nfile-1 the file
	fname_all[:-4] + '_k-sol-' + basename(fname_db) is read, the frames are
	stacked in that order and the result is written to
	fname_all[:-4] + '-sol-' + basename(fname_db).
	"""
	suffix = '-sol-' + os.path.basename( fname_db)
	stem = fname_all[:-4]

	for npart in range( Nfile):
		print(npart, 'is proceeding now...')
		part = pd.read_csv( '{}_{}{}'.format( stem, npart, suffix))
		# The first chunk starts the result; later chunks are appended below it.
		merged = part if npart == 0 else pd.concat( [merged, part])
		print('Shape becomes', merged.shape)

	merged.to_csv( stem + suffix, index = False)

def pd_aq1x( dfr_rafa, fg_l = [ '', '(C(=O)O)', '(N(C)C)', '(P(=O)(O)O)', '(O)', '(S(=O)(=O)O)']):
	"""
	Collect the database rows of the 'AQ' base molecule with one functional group.

	For every functional group in fg_l the SMILES strings produced by
	jquinone.aq1x are looked up in dfr_rafa. The matching rows are annotated
	with 'BaseName', 'Base', 'R-Group' and 'R-Position' columns, saved to
	'sheet/aq1x11.csv' and returned.

	NOTE(review): the annotation assumes two matched rows per functional
	group (positions 1 and 2) - confirm against jquinone.aq1x.
	"""
	matches = [pd_find_SMILES( dfr_rafa, cs) for fg in fg_l for cs in jquinone.aq1x( fg)]
	pdw = pd.concat( matches)
	n_rows = pdw.shape[0]

	pdw['BaseName'] = [ 'AQ'] * n_rows
	pdw['Base'] = ['O=C1c2c{0}c{1}ccc2C(=O)c2ccccc21'] * n_rows
	# Each functional group is listed twice, once per attachment position.
	pdw['R-Group'] = [fg for fg in fg_l for _ in range( 2)]
	pdw['R-Position'] = [1, 2] * int( n_rows/2)

	# No attachment cases is reduced to only one case
	pdw = pdw[1:]
	pdw.to_csv( 'sheet/aq1x11.csv', index = False)

	return pdw

def pd_bq1x( dfr_rafa, fg_l = [ '', '(C(=O)O)', '(P(=O)(O)O)', '(O)', '(S(=O)(=O)O)']):
	"""
	Collect the database rows of the 'BQ' base molecule with one functional group.

	For every functional group in fg_l the SMILES strings produced by
	jquinone.mol1x for the base pattern are looked up in dfr_rafa. The
	matching rows are annotated with 'BaseName', 'Base', 'R-Group' and
	'R-Position' columns, saved to 'sheet/bq1x11.csv' and returned.

	- Only 4 R-groups and no R-group are considered by default.
	- R-Position counts from 0 up to the number of attachment sites - 1.

	Raises TypeError when a generated SMILES has no row in dfr_rafa.
	"""

	# 's' is the base SMILES pattern, 'l' the number of attachment sites in it.
	mol = {'s': 'C1=CC(=O)C{0}=CC1=O', 'l': 1}

	pdw_l = list()

	for fg in fg_l:    
		for cs in jquinone.mol1x( mol['s'], mol['l'], fg):
			found = pd_find_SMILES( dfr_rafa, cs)
			if found.shape[0] == 0:
				raise TypeError( 'R-group of {} does not exist in the database!'.format( fg))
			# Reuse the lookup result instead of querying the database a second time.
			pdw_l.append( found)

	pdw = pd.concat( pdw_l)

	pdw['BaseName'] = [ 'BQ'] * pdw.shape[0]	
	pdw['Base'] = [ mol['s']] * pdw.shape[0]

	# One entry per attachment site for every functional group.
	fg2_l = []
	for x in fg_l:
		fg2_l.extend( [x] * mol['l'])
	pdw['R-Group'] = fg2_l
	pdw['R-Position'] = list(range( mol['l'])) * len( fg_l)

	pdw = pdw[ mol['l'] - 1:] #No attachment cases is reduced to only one case
	pdw.to_csv( 'sheet/bq1x11.csv', index = False)

	return pdw

def pd_nq1x( dfr_rafa, fg_l = [ '', '(C(=O)O)', '(P(=O)(O)O)', '(O)', '(S(=O)(=O)O)']):
	"""
	Collect the database rows of the 'NQ' base molecule with one functional group.

	For every functional group in fg_l the SMILES strings produced by
	jquinone.mol1x for the base pattern are looked up in dfr_rafa. The
	matching rows are annotated with 'BaseName', 'Base', 'R-Group' and
	'R-Position' columns, saved to 'sheet/nq1x11.csv' and returned.

	- Only 4 R-groups and no R-group are considered by default.
	- R-Position counts from 0 up to the number of attachment sites - 1.

	Raises TypeError when a generated SMILES has no row in dfr_rafa.
	"""

	# 's' is the base SMILES pattern, 'l' the number of attachment sites in it.
	mol = {'s': 'O=C1C{0}=CC(=O)c2c{1}c{2}ccc21', 'l': 3}

	pdw_l = list()

	for fg in fg_l:    
		for cs in jquinone.mol1x( mol['s'], mol['l'], fg):
			found = pd_find_SMILES( dfr_rafa, cs)
			if found.shape[0] == 0:
				raise TypeError( 'R-group of {} does not exist in the database!'.format( fg))
			# Reuse the lookup result instead of querying the database a second time.
			pdw_l.append( found)

	pdw = pd.concat( pdw_l)

	pdw['BaseName'] = [ 'NQ'] * pdw.shape[0]	
	pdw['Base'] = [ mol['s']] * pdw.shape[0]

	# One entry per attachment site for every functional group.
	fg2_l = []
	for x in fg_l:
		fg2_l.extend( [x] * mol['l'])
	pdw['R-Group'] = fg2_l
	pdw['R-Position'] = list(range( mol['l'])) * len( fg_l)

	pdw = pdw[ mol['l'] - 1:] #No attachment cases is reduced to only one case
	pdw.to_csv( 'sheet/nq1x11.csv', index = False)

	return pdw	

def pd_sub( pdr, s_l, p = 'S(=O)(=O)O', graph = False): 
	"""
	Select the database rows whose molecule matches a substructure pattern.

	pdr: whole database
	s_l: list of SMILES codes, one per row of pdr
	p: substructure pattern to search
	graph: when True, a histogram of the 'exp' values of the hits is drawn
		and every hit is printed and shown with jchem.show_mol

	The two lists returned by jchem.matches are stored in the 'c_l' and
	'r_l' columns of a copy of pdr; rows with c_l > 0 are returned.
	"""
	counts, r_l = jchem.matches( s_l, p)
	# Number of molecules with a positive 'c_l' value.
	print( int( np.count_nonzero( np.array( counts) > 0)))

	annotated = pdr.copy()
	annotated['c_l'] = counts
	annotated['r_l'] = r_l
	hits = annotated[ annotated['c_l'] > 0]

	print('mean( logS) for R-group {} is'.format(p), np.mean( hits.exp))
	if not graph:
		return hits

	plt.hist( hits.exp.values)
	for smiles, value in zip( hits.SMILES, hits.exp):
		print(smiles, value)
		jchem.show_mol( smiles)

	return hits

def file_subs( fname = 'sheet/wang3664_smiles.csv', 
		p_l = ['C(=O)O', 'P(=O)(O)O', 'S(=O)(=O)O', 'N(C)C', 'O'], graph = False):
	"""
	Run pd_sub on a csv file for a list of substructure patterns.

	The hits for each pattern p are saved to fname[:-4] + '_' + p + '.csv'.
	When graph is True the figure prepared by pd_sub is shown after each
	pattern.
	"""
	pdr = pd.read_csv( fname)
	smiles_all = pdr.SMILES.tolist()
	stem = fname[:-4]

	for p in p_l: 
		hits = pd_sub( pdr, smiles_all, p, graph = graph)
		fname_p = '{}_{}.csv'.format( stem, p)
		print('Search results are saved to', fname_p)
		hits.to_csv( fname_p, index = False)
		if graph:
			plt.show()

def pd_cxcalc_csv(rfile = 'quinones45.8p', sfile = 'quinones45.smiles'):
	"""
	Merge a cxcalc result table and its SMILES list into a single csv file.

	rfile: tab-separated table with a header row; it must contain the
		columns 'Aromatic atom count' and 'Atom count'.
	sfile: tab-separated file without a header; its first column becomes
		'SMILES' and its second column 'Compound name'.

	An 'Aromatic portion' column (aromatic atoms / all atoms) is added and
	the table is written to rfile[:-2] + 'csv'.
	"""
	table = pd.read_csv( rfile, sep='\t')
	names = pd.read_csv( sfile, sep='\t', header=None)

	table['Aromatic portion'] = table["Aromatic atom count"].div( table["Atom count"])
	for column, position in (('SMILES', 0), ('Compound name', 1)):
		table[column] = names[position]

	table.to_csv( rfile[:-2] + 'csv', index = False)

def gr_beststd( gr):
	"""
	Return the standard deviation of the cross-validation scores that belong
	to the best alpha of a grid search result.

	gr must provide best_params_['alpha'] and grid_scores_, whose entries
	carry 'parameters' and 'cv_validation_scores'.
	None is returned when no entry uses the best alpha.
	"""
	candidates = (sc for sc in gr.grid_scores_
			if sc.parameters['alpha'] == gr.best_params_['alpha'])
	# Only the first matching entry is used.
	hit = next( candidates, None)
	if hit is None:
		return None
	return np.std( hit.cv_validation_scores)

def pd_gscv( pdr, method, xM, yV, alphas_log, colname = 'Predicted-RP', fname = 'sheet/rafa36795_cxcalc_prp1000.csv'):
	"""
	Run a grid search, cross-validate with the best alpha and save the results.

	pdr: DataFrame the predictions are attached to.
	method: estimator identifier handed to jgrid.gs and jgrid.cv.
	xM, yV: feature matrix and target values.
	alphas_log: alpha grid specification handed to jgrid.gs.
	colname: name of the column that receives the cross-validated predictions.
	fname: csv file for the annotated copy of pdr; the best estimator is
		dumped next to it with the last three characters replaced by 'pkl'.

	Returns None.
	"""

	print("1. Searching the best hyper-parameter by a grid method.")
	gr = jgrid.gs( method, xM, yV, alphas_log)
	print(gr.grid_scores_)
	best_alpha = gr.best_params_['alpha']
	print("Best alpha:", best_alpha)

	print("2. Predicting the property using the best hyper-parameter and show a x-y plot")
	# Cross-validate the estimator that was actually tuned. This call used to be
	# hard-coded to 'Lasso', which paired the tuned alpha with a different model
	# whenever method was not 'Lasso'.
	# NOTE(review): assumes jgrid.cv accepts the same identifiers as jgrid.gs - confirm.
	yV_pred = jgrid.cv( method, xM, yV, alpha = best_alpha, grid_std = gr_beststd(gr))

	print("3. Saving the predicted results in crossvalidation into", fname)
	pdw = pdr.copy()
	pdw[ colname] = yV_pred.tolist()
	pdw.to_csv( fname, index = False)

	print("4. Saving the best estimator as a pkl file")
	print(gr.best_estimator_)
	externals.joblib.dump(gr.best_estimator_, fname[:-3] +  "pkl")

def pd_predict_lm( fname_model, fname_data, N = None):
	"""
	Predict with a dumped model for the first N rows of a csv file.

	First, the model is loaded from fname_model (a pkl file).
	Second, the input data is read from fname_data and xM is built as the
	column-wise concatenation of the features of the 'R-SMILES' column
	followed by those of the 'SMILES' column.

	The first N rows plus a 'predicted_redox_potential' column are saved to
	fname_data[:-4] + str(N) + '-rp-' + <model base name without its last
	four characters> + '.csv'. All rows are used when N is None.

	Returns the predicted values.
	"""
	print("1. The saved model is loaded where the file name of the model is", fname_model)
	lm = externals.joblib.load( fname_model)

	print("2. The data is loaded from", fname_data, "only for N=", N)
	pdr = pd.read_csv( fname_data) 

	print("3. The prediction is performed.")
	N = pdr.shape[0] if N is None else N

	# Feature order matters: 'R-SMILES' block first, then 'SMILES'.
	blocks = [pd_get_xM_N( pdr, smiles_id = column, N = N) for column in ('R-SMILES', 'SMILES')]
	yV_pred = lm.predict( np.concatenate( blocks, axis = 1))

	pdw = pdr[:N].copy()
	pdw['predicted_redox_potential'] = yV_pred
	model_stem = os.path.basename( fname_model)[:-4]
	fname_save = '{}{}-rp-{}.csv'.format( fname_data[:-4], N, model_stem)

	print("4. The predicted data are save to", fname_save)
	pdw.to_csv( fname_save, index = False)

	return yV_pred

def pd_rp_predict_lm_each( fname_model, smiles, smiles_rd):
	"""
	Predict with a dumped model for a single pair of SMILES strings.

	The model is loaded from fname_model (a pkl file) on every call. xM is
	the column-wise concatenation of the features of smiles followed by
	those of smiles_rd.

	Returns the output of the model's predict() for that single sample.
	"""
	lm = externals.joblib.load( fname_model)

	# One single-row feature block per SMILES string, joined side by side.
	blocks = [jchem.get_xM( [s]) for s in (smiles, smiles_rd)]

	return lm.predict( np.concatenate( blocks, axis = 1))

def pd_group( pdr, colname, val):
	"""
	Returns the target group data after removing the associated column.

	pdr: source DataFrame
	colname: column used for the selection; it is dropped from the result
	val: value that colname must equal

	The result is re-indexed from 0.
	"""
	selected = pdr[ pdr[ colname] == val]
	# axis is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
	return selected.drop( colname, axis = 1).reset_index(drop=True)

def pd_performnace( pdr, x_id = 'Predicted-RP', y_id = 'redox_potential'):
	"""
	The performance of regression is calculated from the regression results. 
	Its performance metrics and the x-y scattering graph are shown.

	pdr: DataFrame holding both columns
	x_id: column with the predicted values
	y_id: column with the reference values

	Returns whatever jutil.regress_show3 returns; usage in the original code:
	r2, RMSE, AAE = pd_performnace( pdr = pdr36743, x_id = 'Predicted-RP', y_id = 'redox_potential')
	"""
	# np.asmatrix is what np.mat aliased; the np.mat name was removed in NumPy 2.0.
	# Both columns are turned into column vectors.
	xM = np.asmatrix( pdr[x_id]).T
	yV = np.asmatrix( pdr[y_id]).T
	return jutil.regress_show3( yV, xM)

def list_or( pdr, cn, c_l):
	"""
	Find rows whose column cn equals one of the list elements.

	pdr: DataFrame to search
	cn: column name
	c_l: candidate values, compared with '=='

	Returns a boolean Series aligned with pdr (all False when c_l is empty),
	usable directly as a row mask: pdr[ list_or( pdr, cn, c_l)].
	"""
	# Start from a Series rather than a plain list: or-ing a list with a Series
	# is deprecated in pandas, and an empty c_l used to return the bare list.
	cond = pd.Series( False, index = pdr.index, name = cn)
	for c in c_l:
		cond |= pdr[ cn] == c
	
	return cond

def pd_show_dupset_by( df, subset = "SLN"):
	"""
	Collect all rows that share a duplicated value in one column.

	df: DataFrame to inspect
	subset: name of the column checked for duplicates

	Every repeated occurrence reported by df.duplicated yields one group made
	of all rows with the same value; the group number is stored in a 'Pair'
	column. The positions of the repeated occurrences are printed.

	Returns the groups stacked into one DataFrame with all columns of df plus
	'Pair'. When there is no duplicate, an empty DataFrame with those columns
	is returned.
	"""
	is_repeat = df.duplicated( subset)
	dup_set = np.flatnonzero( is_repeat.values)
	print("Dupplication set")
	print( dup_set)

	keys = df[subset]
	df_l = []
	for i, s in enumerate(dup_set):
		# dup_set holds row positions, so the value is read by position (iloc);
		# a label lookup is wrong whenever df does not carry the default index.
		df_dup = df[ keys == keys.iloc[s]].copy()
		df_dup["Pair"] = i
		df_l.append( df_dup)

	if not df_l:
		# pd.concat refuses an empty list; hand back an empty frame instead.
		no_pairs = df.iloc[:0].copy()
		no_pairs["Pair"] = np.array( [], dtype = int)
		return no_pairs

	# Return full column information without abbreviation. 
	return pd.concat( df_l)