Source code for cartgen

"""
``cartgen`` is a module providing models that can be used with the sklearn library.




Copyright (C) 2021 Evgenii Tsatsorin eugtsa@gmail.com 
Full license in LICENSE file.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import random
import numpy as np
import logging
from cartesian_genetics_base.cartesian_genome_func import CartesianGenomeFunc

class CartGenModel:
    """Generic ML model built on an evolved ``CartesianGenomeFunc``.

    ``CartGenModel`` can be applied to any ML task (regression,
    classification, multiclass, etc.) and exposes an sklearn-like interface
    (``fit`` / ``predict`` / ``get_params`` / ``set_params``). It is a simple
    generations-based optimizer for a ``CartesianGenomeFunc`` driven by any
    custom metric.

    Args:
        metric_to_minimize (callable): metric function with a signature
            analogous to sklearn metrics; it is called as
            ``metric(predictions, y)`` and its result is minimized
        n_generations (int): number of generations to evolve
        samples_in_gen (int): number of samples in each generation for each
            elitary sample
        elitarity_n (int): number of elite best samples to save
        mutation_points (int): number of points to mutate in each elitary
            sample on new samples acquisition
        tqdm (callable): progress wrapper with signature ``lambda x: x``.
            Use ``tqdm`` or ``tqdm_notebook`` from the tqdm package
        n_inputs (int): number of inputs
        n_outputs (int): number of outputs
        depth (int): depth of genome func representation
        n_rows (int): number of functions on each layer of depth
        basis_funcs (list): list of callables, basis functions for genome
            func representations
        recurse_depth (int): depth of previous layers allowed to transmit
            inputs to each next level
        arity (int): arity of basis functions; if not set it is taken from
            the underlying ``CartesianGenomeFunc``
        full_mutate_prob (float): probability that all possible mutations of
            the last mutated point are enumerated for an individual
        seed (int): random seed for random operations (init_random_genome
            and such)
        cgf (CartesianGenomeFunc): function to use as cgf if you don't want
            to create one

    Examples:
        ::

            import numpy as np
            from cartgen import CartGenModel
            from tqdm import tqdm_notebook
            from sklearn.datasets import load_digits
            from sklearn.metrics import mean_absolute_error
            from sklearn.model_selection import train_test_split
            from sklearn.utils import shuffle
            from sklearn.preprocessing import StandardScaler

            dataset = load_digits()
            data,target = dataset['data'],dataset['target']
            data,target = shuffle(data,target,random_state=1)
            data = StandardScaler().fit_transform(data)
            X_train,X_test,y_train,y_test = train_test_split(
                data,target,test_size=0.33,random_state=42)

            def sqrt(x): return np.sqrt(np.abs(x))
            def log(x): return np.log(np.abs(x+0.00001))
            def summ(x,y): return x+y
            def diff(x,y): return x-y
            def div(x,y): return x/(y+0.1)
            def neg(x): return -x
            def mult(x,y): return x*y
            def div_2(x): return x/2
            def mult_3(x): return x*3
            def abss(x): return np.abs(x)

            basis = [sqrt,log,neg,summ,mult,div,abss,div_2,mult_3,diff]

            model = CartGenModel(metric_to_minimize=mean_absolute_error,
                                 n_generations=150,
                                 samples_in_gen=50,
                                 mutation_points=3,
                                 elitarity_n=9,
                                 tqdm=tqdm_notebook,
                                 n_inputs=X_train.shape[1],
                                 n_outputs=1,
                                 depth=36,
                                 recurse_depth=9,
                                 basis_funcs=basis,
                                 seed=9,
                                 n_rows=1)
            model.fit(X_train,y_train)
            test_preds = model.predict(X_test)
            print(mean_absolute_error(test_preds,y_test))

        ::

            from sklearn.ensemble import BaggingRegressor

            breg = BaggingRegressor(base_estimator=model,
                                    n_estimators=10,
                                    max_samples=1.0,
                                    max_features=1.0,
                                    bootstrap=True,
                                    bootstrap_features=False,
                                    oob_score=False,
                                    warm_start=False,
                                    n_jobs=None,
                                    random_state=None,
                                    verbose=0)
            breg.fit(X_train,y_train)
            test_preds = breg.predict(X_test)
            print(mean_absolute_error(test_preds,y_test))
    """

    # Parameters that define the shape of the underlying CartesianGenomeFunc.
    # Changing any of them through ``set_params`` requires building a new CGF.
    _STRUCTURAL_PARAMS = frozenset((
        'n_inputs', 'n_outputs', 'depth', 'n_rows',
        'basis_funcs', 'recurse_depth', 'arity', 'seed',
    ))

    def __init__(self,
                 metric_to_minimize=None,
                 n_generations=20,
                 samples_in_gen=50,
                 elitarity_n=5,
                 mutation_points=3,
                 tqdm=None,
                 n_inputs=None,
                 n_outputs=None,
                 depth=None,
                 n_rows=1,
                 basis_funcs=None,
                 recurse_depth=5,
                 arity=None,
                 full_mutate_prob=0.0,
                 seed=None,
                 cgf=None):
        """CGP Model for ML.

        Uses regression with a cartesian genome function, optimized with an
        elitarity N+lambda genetic process. Every argument is described in
        the class docstring; all of them are stored on the instance under
        the same name.
        """
        # Single source of truth for attribute initialisation, shared with
        # ``set_params`` so both paths stay consistent.
        self._set_initial_params(arity=arity,
                                 basis_funcs=basis_funcs,
                                 cgf=cgf,
                                 depth=depth,
                                 elitarity_n=elitarity_n,
                                 metric_to_minimize=metric_to_minimize,
                                 mutation_points=mutation_points,
                                 n_generations=n_generations,
                                 n_inputs=n_inputs,
                                 n_outputs=n_outputs,
                                 n_rows=n_rows,
                                 recurse_depth=recurse_depth,
                                 samples_in_gen=samples_in_gen,
                                 seed=seed,
                                 tqdm=tqdm,
                                 full_mutate_prob=full_mutate_prob)

    def _set_initial_params(self, arity, basis_funcs, cgf, depth, elitarity_n,
                            metric_to_minimize, mutation_points, n_generations,
                            n_inputs, n_outputs, n_rows, recurse_depth,
                            samples_in_gen, seed, tqdm, full_mutate_prob):
        """Store every hyper-parameter and create (or adopt) the CGF.

        When ``cgf`` is a ``CartesianGenomeFunc`` it is adopted as-is and the
        model is flagged as already fitted; otherwise a new one is built from
        the structural parameters and the model is flagged as not fitted.
        Seeds the global ``random`` module when ``seed`` is not None.
        """
        self.n_generations = n_generations
        self.samples_in_gen = samples_in_gen
        self.elitarity_n = elitarity_n
        self.mutation_points = mutation_points
        self.recurse_depth = recurse_depth
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.depth = depth
        self.n_rows = n_rows
        self.basis_funcs = basis_funcs
        self.full_mutate_prob = full_mutate_prob
        self.arity = arity
        self.seed = seed

        if cgf is not None and isinstance(cgf, CartesianGenomeFunc):
            self.cgf = cgf
            self.not_fitted_yet = False
        else:
            self.cgf = CartesianGenomeFunc(n_inputs=n_inputs,
                                           n_outputs=n_outputs,
                                           depth=depth,
                                           n_rows=n_rows,
                                           basis_funcs=basis_funcs,
                                           recurse_depth=recurse_depth,
                                           arity=arity,
                                           seed=seed)
            self.not_fitted_yet = True

        # ``_get_mutated_samples`` needs a concrete arity, so fall back to
        # the one the CGF determined for itself.
        if self.arity is None:
            self.arity = self.cgf._arity

        if seed is not None:
            random.seed(seed)

        self.metric_to_minimize = metric_to_minimize
        # Default progress wrapper is the identity function.
        self.tqdm = tqdm if tqdm is not None else (lambda x: x)

    def _get_mutated_samples(self, in_sample, n_points=1, new_samples_count=10,
                             full_mutate_prob=0.0):
        """Lazily generate mutated copies of a genome.

        Args:
            in_sample (list): genome to mutate; it is never modified
            n_points (int): number of random point mutations per new sample
            new_samples_count (int): maximum number of samples to yield
            full_mutate_prob (float): probability of replacing a sample with
                the enumeration of every discrete value of its last mutated
                point

        Yields:
            list: a new genome; at most ``new_samples_count`` in total
        """
        positions = range(len(in_sample))
        remaining = new_samples_count
        # ``> 0`` rather than ``!= 0``: the enumeration branch may consume
        # several units per pass and must not be able to skip past zero.
        while remaining > 0:
            new_sample = list(in_sample)
            mutate_point = None
            points_to_do = n_points
            while points_to_do > 0:
                mutate_point = random.choice(positions)
                new_sample[mutate_point] = random.random()
                points_to_do -= 1

            if (mutate_point is not None
                    and full_mutate_prob > 0
                    and random.random() < full_mutate_prob):
                # NOTE(review): positions divisible by 3 are treated as
                # function selectors and the rest as input connections --
                # presumably the genome is laid out in triples; confirm
                # against CartesianGenomeFunc.
                if mutate_point % 3 != 0:
                    n_to_mutate = self.arity * self.n_rows
                else:
                    n_to_mutate = len(self.basis_funcs)
                if n_to_mutate > 0:
                    for i in range(n_to_mutate):
                        variant = list(new_sample)
                        variant[mutate_point] = float(i) / n_to_mutate
                        yield variant
                        remaining -= 1
                        if remaining <= 0:
                            return
                    continue
                # Nothing to enumerate: fall through and emit the plain
                # mutated sample so the loop always makes progress.

            yield new_sample
            remaining -= 1

    def get_params(self, deep=False):
        """Get parameters of the estimator.

        sklearn interface, see
        https://scikit-learn.org/stable/developers/develop.html#cloning

        Args:
            deep (bool): sklearn parameter stub, ignored

        Returns:
            dict: constructor argument names mapped to their current values
        """
        return {'metric_to_minimize': self.metric_to_minimize,
                'n_generations': self.n_generations,
                'samples_in_gen': self.samples_in_gen,
                'elitarity_n': self.elitarity_n,
                'mutation_points': self.mutation_points,
                'tqdm': self.tqdm,
                'n_inputs': self.n_inputs,
                'n_outputs': self.n_outputs,
                'depth': self.depth,
                'n_rows': self.n_rows,
                'basis_funcs': self.basis_funcs,
                'recurse_depth': self.recurse_depth,
                'arity': self.arity,
                'seed': self.seed,
                'cgf': self.cgf,
                'full_mutate_prob': self.full_mutate_prob}

    def set_params(self, **params):
        """Set parameters of the estimator.

        sklearn interface, see
        https://scikit-learn.org/stable/developers/develop.html#cloning

        Any subset of the constructor arguments may be given; the remaining
        ones keep their current values. If a structural parameter (one that
        shapes the ``CartesianGenomeFunc``) changes and no ``cgf`` is passed,
        a fresh CGF is built and the model becomes not fitted. When changing
        ``basis_funcs`` pass ``arity`` as well if it differs, because the
        currently stored arity is reused otherwise.

        Args:
            params (kwargs): parameters kwargs

        Returns:
            CartGenModel: this model, updated with parameters from kwargs

        Raises:
            TypeError: if an unknown parameter name is given
        """
        if not params:
            return self

        merged = self.get_params()
        merged.update(params)

        keeps_current_cgf = 'cgf' not in params
        if keeps_current_cgf and self._STRUCTURAL_PARAMS.intersection(params):
            # The existing CGF no longer matches the requested structure.
            merged['cgf'] = None
            keeps_current_cgf = False

        was_not_fitted = getattr(self, 'not_fitted_yet', True)
        self._set_initial_params(**merged)
        if keeps_current_cgf:
            # Re-adopting our own CGF must not change the fitted state.
            self.not_fitted_yet = was_not_fitted
        return self

    def fit(self, X, y):
        """Fit X and y: evolve genomes and keep the best learned CGF.

        Args:
            X (numpy.array): numpy array matrix with features to learn
            y (numpy.array): numpy array matrix with target to learn

        Returns:
            CartGenModel: this model, with the best genome set on ``self.cgf``
        """
        cgf = self.cgf
        # The CGF takes one array per feature column; X does not change
        # during fitting, so split it once.
        columns = [X[:, i] for i in range(X.shape[1])]
        already_scored = set()

        cgf.init_random_genome()
        initial_score = self.metric_to_minimize(cgf.call(columns)[0], y)
        self._top_scores = [initial_score for _ in range(self.elitarity_n)]
        self._top_genomes = [cgf.get_genome() for _ in range(self.elitarity_n)]

        # learning genome for some generations
        for _ in self.tqdm(range(self.n_generations)):
            # One lazy mutation stream per elite genome, consumed in lockstep.
            mutation_streams = [
                self._get_mutated_samples(
                    genome,
                    n_points=self.mutation_points,
                    new_samples_count=self.samples_in_gen,
                    full_mutate_prob=self.full_mutate_prob)
                for genome in self._top_genomes]

            for candidates in zip(*mutation_streams):
                for candidate in candidates:
                    key = tuple(candidate)
                    if key in already_scored:
                        continue
                    already_scored.add(key)

                    cgf.set_genome(candidate)
                    new_score = self.metric_to_minimize(
                        cgf.call(columns)[0], y)

                    # Replace the last elite slot that the candidate is at
                    # least as good as.
                    slot = None
                    for idx, old_score in enumerate(self._top_scores):
                        if new_score <= old_score:
                            slot = idx
                    if slot is not None:
                        self._top_scores[slot] = new_score
                        self._top_genomes[slot] = cgf.get_genome()

        # The replacement rule keeps scores in non-increasing order, so the
        # last elite slot holds the best genome found.
        self.cgf.set_genome(self._top_genomes[-1])
        self.not_fitted_yet = False
        return self

    def predict(self, X):
        """Predict X by running the best fitted CGF function.

        Args:
            X (numpy.array): numpy array matrix with features to predict on

        Returns:
            numpy.array: the CGF outputs stacked and transposed, one column
            per output

        Raises:
            NotImplementedError: if the model has not been fitted
        """
        if self.not_fitted_yet:
            logging.error('Model is not fitted! Use fit method or set_params method first!')
            raise NotImplementedError()
        test_preds = self.cgf.call([X[:, i] for i in range(X.shape[1])])
        return np.vstack(test_preds).T