AshmithaIRRI committed on
Commit 8ff1f8d · verified · 1 Parent(s): cdec802

Update app.py

Files changed (1)
  1. app.py +92 -184
app.py CHANGED
@@ -1,31 +1,10 @@
- # -*- coding: utf-8 -*-
  """
- Created on Tue Jan 28 09:12:48 2025
-
- @author: Ashmitha
- """
-
- # -*- coding: utf-8 -*-
- """
- Created on Sun Nov 24 12:47:37 2024
-
- @author: Ashmitha
- """
-
- # -*- coding: utf-8 -*-
- """
- Created on Sun Nov 24 12:25:57 2024
-
- @author: Ashmitha
- """
-
- # -*- coding: utf-8 -*-
- """
- Created on Sat Nov 9 15:44:40 2024

  @author: Ashmitha
  """

  import pandas as pd
  import numpy as np
  import gradio as gr
@@ -48,57 +27,40 @@ from xgboost import XGBRegressor
  import io
  from sklearn.feature_selection import SelectFromModel
  import tempfile
-
- #-------------------------------------Feature selection---------------------------------------------------------------------------------------------
-
- def RandomForestFeatureSelection(trainX, trainy, num_features=60):
- rf = RandomForestRegressor(n_estimators=1000, random_state=50)
- rf.fit(trainX, trainy)
-
- # Get feature importances
- importances = rf.feature_importances_
-
- # Select the top N important features
- indices = np.argsort(importances)[-num_features:]
- return indices
- #----------------------------------------------------------GRU Model---------------------------------------------------------------------
-
-
- def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2, feature_selection=True):
-
- # Apply feature selection using Random Forest Regressor
- if feature_selection:
- # Use RandomForestRegressor to rank features by importance
- rf = RandomForestRegressor(n_estimators=100, random_state=42)
- rf.fit(trainX, trainy)
-
- # Select features with importance greater than a threshold (e.g., mean importance)
- selector = SelectFromModel(rf, threshold="mean", prefit=True)
- trainX = selector.transform(trainX)
- if testX is not None:
- testX = selector.transform(testX)
- print(f"Selected {trainX.shape[1]} features based on feature importance.")
-
- # Scale the input data using MinMaxScaler to normalize the feature range
- scaler = MinMaxScaler()
- trainX_scaled = scaler.fit_transform(trainX)
- if testX is not None:
- testX_scaled = scaler.transform(testX)
-
- # Scale the target variable using MinMaxScaler
- target_scaler = MinMaxScaler()
- trainy_scaled = target_scaler.fit_transform(trainy.reshape(-1, 1)) # Reshape to 2D for scaler
-
  # Reshape trainX and testX to be 3D: (samples, timesteps, features)
- trainX = trainX_scaled.reshape((trainX.shape[0], 1, trainX.shape[1])) # Adjusted for general feature count
  if testX is not None:
- testX = testX_scaled.reshape((testX.shape[0], 1, testX.shape[1])) # Reshape testX if it exists
-
  model = Sequential()
-
  # GRU Layer
- model.add(GRU(512, input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=False, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
-
  # Dense Layers with Batch Normalization, Dropout, LeakyReLU
  model.add(Dense(256, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(BatchNormalization())
@@ -109,31 +71,31 @@ def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
  model.add(BatchNormalization())
  model.add(Dropout(dropout_rate))
  model.add(LeakyReLU(alpha=0.1))
-
  model.add(Dense(64, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(BatchNormalization())
  model.add(Dropout(dropout_rate))
  model.add(LeakyReLU(alpha=0.1))
-
  model.add(Dense(32, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(BatchNormalization())
  model.add(Dropout(dropout_rate))
  model.add(LeakyReLU(alpha=0.1))
-
  # Output Layer with ReLU activation to prevent negative predictions
  model.add(Dense(1, activation="relu"))
-
  # Compile the model
  model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])
-
  # Callbacks for learning rate reduction and early stopping
  learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=1e-6)
  early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)
-
  # Train the model
- history = model.fit(trainX, trainy_scaled, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
  callbacks=[learning_rate_reduction, early_stopping])
-
  # Predict train and test
  predicted_train = model.predict(trainX)
  predicted_test = model.predict(testX) if testX is not None else None
@@ -142,30 +104,11 @@ def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
  predicted_train = predicted_train.flatten()
  if predicted_test is not None:
  predicted_test = predicted_test.flatten()
- else:
- predicted_test = np.zeros_like(predicted_train)
-
- # Inverse scale the predictions to get them back to original range
- predicted_train = target_scaler.inverse_transform(predicted_train.reshape(-1, 1)).flatten()
- if predicted_test is not None:
- predicted_test = target_scaler.inverse_transform(predicted_test.reshape(-1, 1)).flatten()

  return predicted_train, predicted_test, history

-
-
-
- #-----------------------------------------------------------DeepMap-------------------------------------------------------------------------------
  def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, dropout_rate=0.3,feature_selection=True):
- if feature_selection:
- rf=RandomForestRegressor(n_estimators=100,random_state=60)
- rf.fit(trainX,trainy)
-
- selector=SelectFromModel(rf, threshold="mean",prefit=True)
- trainX=selector.transform(trainX)
- if testX is not None:
- testX=selector.transform(testX)
- print(f"Selected {trainX.shape[1]} feature based on the important feature")
@@ -183,10 +126,14 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
  model = Sequential()

  # Convolutional layers
- model.add(Conv1D(256, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Dropout(dropout_rate))

  model.add(Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Dropout(dropout_rate))
@@ -214,18 +161,9 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
  predicted_test = model.predict(testX).flatten() if testX is not None else None

  return predicted_train, predicted_test, history
-
- #-------------------------------------------------------------------------Random Forest----------------------------------------------------
  def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,feature_selection=True):
- if feature_selection:
- rf=RandomForestRegressor(n_estimators=100, random_state=60)
- rf.fit(trainX, trainy)
- selector=SelectFromModel(rf, threshold="mean", prefit=True)
- trainX=selector.transform(trainX)
- if testX is not None:
- testX=selector.transform(testX)
- print(f"Selected {trainX.shape[1]} feature based on the feature selection")
-

  # Log transformation of the target variable
@@ -245,39 +183,25 @@ def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,featu
  predicted_test = rf_model.predict(testX_scaled) if testX is not None else None

  return predicted_train, predicted_test,history
- #------------------------------------------------------------------------------XGboost---------------------------------------------------------------
  def XGBoostModel(trainX, trainy, testX, testy,learning_rate,min_child_weight,feature_selection=True, n_estimators=100, max_depth=None):
- if feature_selection:
- rf=RandomForestRegressor(n_estimators=100,random_state=60)
- rf.fit(trainX,trainy)
- selector=SelectFromModel(rf,threshold="mean",prefit=True)
- trainX=selector.transform(trainX)
- if testX is not None:
- testX=selector.transform(testX)
- print(f"Selected {trainX.shape[1]} features based on feature importance")
-

- #trainy_log = np.log1p(trainy) # Log-transform to handle large phenotypic values
- #if testy is not None:
- # testy_log = np.log1p(testy)
-
  # Scale the features
  scaler = MinMaxScaler()
  trainX_scaled = scaler.fit_transform(trainX)
  if testX is not None:
  testX_scaled = scaler.transform(testX)

- # Define and train the XGBoost model
- # xgb_model = XGBRegressor(n_estimators=n_estimators, max_depth=100, random_state=42)
- #xgb_model = XGBRegressor(objective ='reg:linear',
- # n_estimators = 100, seed = 100)
- xgb_model=XGBRegressor(objective="reg:squarederror",random_state=60)
  history=xgb_model.fit(trainX, trainy)
  param_grid={
  "learning_rate":0.01,
  "max_depth" : 10,
  "n_estimators": 100,
- "min_child_weight": 5
  }
@@ -287,19 +211,7 @@ def XGBoostModel(trainX, trainy, testX, testy,learning_rate,min_child_weight,fea

  return predicted_train, predicted_test,history
-
-
-
-
-
-
- #----------------------------------------reading file----------------------------------------------------------------------------------------
-
-
-
-
-
- # Helper function to read the uploaded CSV file
  def read_csv_file(uploaded_file):
  if uploaded_file is not None:
  if hasattr(uploaded_file, 'data'): # For NamedBytes
@@ -307,37 +219,34 @@ def read_csv_file(uploaded_file):
  elif hasattr(uploaded_file, 'name'): # For NamedString
  return pd.read_csv(uploaded_file.name)
  return None

- #--------------------------------------------------- Nested Cross validation---------------------------------------------------------------------------

  def NestedKFoldCrossValidation(training_data, training_additive, testing_data, testing_additive,
  training_dominance, testing_dominance, epochs, learning_rate, min_child_weight, batch_size=64,
  outer_n_splits=2, output_file='cross_validation_results.csv',
  predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True):

  # Define calculate_topsis_score before using it
- def calculate_topsis_score(df):
- # Normalize the data
- norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
-
- # Calculate the positive and negative ideal solutions
- ideal_positive = norm_df.max(axis=0)
- ideal_negative = norm_df.min(axis=0)
-
- # Calculate the Euclidean distances
- dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
- dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
-
- # Calculate the TOPSIS score
- topsis_score = dist_negative / (dist_positive + dist_negative)
-
- # Add the TOPSIS score to the dataframe
- df['TOPSIS_Score'] = topsis_score
-
- return df

  # Original function logic continues here
  if 'phenotypes' not in training_data.columns:
@@ -360,20 +269,6 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
  training_genotypic_data_merged = training_data_merged.iloc[:, 2:].values
  testing_genotypic_data_merged = testing_data_merged.iloc[:, 1:].values

- # Feature selection
- if feature_selection:
- rf = RandomForestRegressor(n_estimators=100, random_state=60)
- rf.fit(training_genotypic_data_merged, phenotypic_info)
- selector = SelectFromModel(rf, threshold="mean", prefit=True)
- training_genotypic_data_merged = selector.transform(training_genotypic_data_merged)
- testing_genotypic_data_merged = selector.transform(testing_genotypic_data_merged)
- print(f"Selected {training_genotypic_data_merged.shape[1]} features based on importance.")
-
- # Standardize the genotypic data
- scaler = StandardScaler()
- training_genotypic_data_merged = scaler.fit_transform(training_genotypic_data_merged)
- testing_genotypic_data_merged = scaler.transform(testing_genotypic_data_merged)
-
  outer_kf = KFold(n_splits=outer_n_splits)

  results = []
@@ -397,7 +292,22 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
  outer_trainX = training_genotypic_data_merged[outer_train_index]
  outer_trainy = phenotypic_info[outer_train_index]

- outer_testX = testing_genotypic_data_merged
  outer_testy = phenotypic_test_info

  for model_name, model_func in models:
@@ -468,18 +378,14 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
  predicted_all_df.to_csv(predicted_phenotype_file, index=False)

  return avg_results_df, predicted_all_df if all_predicted_phenotypes else None
-
-
-
- #--------------------------------------------------------------------Gradio interface---------------------------------------------------------------
-
  def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
  training_dominance_file, testing_dominance_file,feature_selection,learning_rate,min_child_weight):

  # Default parameters
  epochs = 1000
  batch_size = 64
- outer_n_splits = 10
  #inner_n_splits = 2
  min_child_weight=5
  learning_rate=0.001
@@ -552,3 +458,5 @@ with gr.Blocks() as interface:

  # Launch the interface
  interface.launch()

  """
+ Created on Tue Jan 28 13:43:25 2025

  @author: Ashmitha
  """

+ #---------------------------------------------Libraries--------------------------
  import pandas as pd
  import numpy as np
  import gradio as gr
  import io
  from sklearn.feature_selection import SelectFromModel
  import tempfile
+ #------------------------------------------GRUModel-------------------------------------
+ def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2):
+ """
+ GRU Model for regression tasks.
+
+ Args:
+ trainX (np.array): Training features of shape (samples, features).
+ trainy (np.array): Training target values of shape (samples,).
+ testX (np.array): Testing features of shape (samples, features).
+ testy (np.array): Testing target values of shape (samples,).
+ epochs (int): Number of epochs for training.
+ batch_size (int): Batch size for training.
+ learning_rate (float): Learning rate for the optimizer.
+ l1_reg (float): L1 regularization parameter.
+ l2_reg (float): L2 regularization parameter.
+ dropout_rate (float): Dropout rate for regularization.
+
+ Returns:
+ predicted_train (np.array): Predicted values for the training set.
+ predicted_test (np.array): Predicted values for the testing set.
+ history: Training history.
+ """
  # Reshape trainX and testX to be 3D: (samples, timesteps, features)
+ trainX = trainX.reshape((trainX.shape[0], 1, trainX.shape[1])) # Adjusted for general feature count
  if testX is not None:
+ testX = testX.reshape((testX.shape[0], 1, testX.shape[1])) # Reshape testX if it exists
+
+ # Define the GRU model
  model = Sequential()
+
  # GRU Layer
+ model.add(GRU(512, input_shape=(trainX.shape[1], trainX.shape[2]), return_sequences=False,
+ kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+
  # Dense Layers with Batch Normalization, Dropout, LeakyReLU
  model.add(Dense(256, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(BatchNormalization())

  model.add(BatchNormalization())
  model.add(Dropout(dropout_rate))
  model.add(LeakyReLU(alpha=0.1))
+
  model.add(Dense(64, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(BatchNormalization())
  model.add(Dropout(dropout_rate))
  model.add(LeakyReLU(alpha=0.1))
+
  model.add(Dense(32, kernel_initializer='he_normal', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(BatchNormalization())
  model.add(Dropout(dropout_rate))
  model.add(LeakyReLU(alpha=0.1))
+
  # Output Layer with ReLU activation to prevent negative predictions
  model.add(Dense(1, activation="relu"))
+
  # Compile the model
  model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate), metrics=['mse'])
+
  # Callbacks for learning rate reduction and early stopping
  learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1, factor=0.5, min_lr=1e-6)
  early_stopping = EarlyStopping(monitor='val_loss', verbose=1, restore_best_weights=True, patience=10)
+
  # Train the model
+ history = model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1,
  callbacks=[learning_rate_reduction, early_stopping])
+
  # Predict train and test
  predicted_train = model.predict(trainX)
  predicted_test = model.predict(testX) if testX is not None else None

  predicted_train = predicted_train.flatten()
  if predicted_test is not None:
  predicted_test = predicted_test.flatten()

  return predicted_train, predicted_test, history
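
For orientation, here is a minimal sketch of how the refactored GRUModel might be called; the synthetic marker matrix, the 80/20 split, and the reduced epoch count are assumptions for illustration, not part of this commit.

import numpy as np
# from app import GRUModel   # assumption: app.py is importable as a module

rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(100, 40)).astype(float)            # synthetic marker matrix (samples x markers)
y = X @ rng.normal(size=40) + rng.normal(scale=0.1, size=100)    # synthetic phenotype

trainX, testX, trainy, testy = X[:80], X[80:], y[:80], y[80:]    # simple 80/20 split for the sketch
pred_train, pred_test, history = GRUModel(trainX, trainy, testX, testy, epochs=5, batch_size=16)
print(pred_test.shape)   # (20,)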

+ #--------------------------------------------------CNNModel-------------------------------------------
  def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, dropout_rate=0.3,feature_selection=True):

  model = Sequential()

  # Convolutional layers
+ model.add(Conv1D(512, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Dropout(dropout_rate))

+ model.add(Conv1D(256, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+ model.add(MaxPooling1D(pool_size=2))
+ model.add(Dropout(dropout_rate))
+
  model.add(Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
  model.add(MaxPooling1D(pool_size=2))
  model.add(Dropout(dropout_rate))

  predicted_test = model.predict(testX).flatten() if testX is not None else None

  return predicted_train, predicted_test, history
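
One note on the convolutional stack: Conv1D with input_shape=(trainX.shape[1], 1) expects a trailing channel axis, so the caller presumably adds one before fitting. A minimal sketch of that reshape (the array here is made up):

import numpy as np

trainX = np.random.rand(32, 120)        # (samples, markers), synthetic
trainX_cnn = trainX[..., np.newaxis]    # -> (32, 120, 1), the layout Conv1D expects
print(trainX_cnn.shape)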
+ #------------------------------------------RFModel---------------------------------------------------
  def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,feature_selection=True):
+
  # Log transformation of the target variable

  predicted_test = rf_model.predict(testX_scaled) if testX is not None else None

  return predicted_train, predicted_test,history
+ #-------------------------------------------------XGBoost--------------------------------------------
  def XGBoostModel(trainX, trainy, testX, testy,learning_rate,min_child_weight,feature_selection=True, n_estimators=100, max_depth=None):
+
  # Scale the features
  scaler = MinMaxScaler()
  trainX_scaled = scaler.fit_transform(trainX)
  if testX is not None:
  testX_scaled = scaler.transform(testX)
+
+ xgb_model=XGBRegressor(objective="reg:squarederror",random_state=42)
  history=xgb_model.fit(trainX, trainy)
  param_grid={
  "learning_rate":0.01,
  "max_depth" : 10,
  "n_estimators": 100,
+ "min_child_weight": 10
  }

  return predicted_train, predicted_test,history
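
param_grid is defined after the initial fit, but the code that consumes it falls outside the visible hunk. If it is intended for a grid search, a typical scikit-learn wiring would look roughly like the sketch below; the list-valued grid, the scoring choice, and the variable names are assumptions, not necessarily what app.py does.

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

param_grid = {
    "learning_rate": [0.01, 0.1],
    "max_depth": [6, 10],
    "n_estimators": [100, 300],
    "min_child_weight": [5, 10],
}
search = GridSearchCV(
    XGBRegressor(objective="reg:squarederror", random_state=42),
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)
search.fit(trainX_scaled, trainy)   # hypothetical: reuses the scaled arrays from XGBoostModel
predicted_test = search.best_estimator_.predict(testX_scaled) if testX is not None else None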
+ #------------------------------------------------------------------File--------------------------------------------
  def read_csv_file(uploaded_file):
  if uploaded_file is not None:
  if hasattr(uploaded_file, 'data'): # For NamedBytes
  elif hasattr(uploaded_file, 'name'): # For NamedString
  return pd.read_csv(uploaded_file.name)
  return None
+ #------------------------------------------------------------Calculating TOPSIS score---------------------------
+ def calculate_topsis_score(df):
+ # Normalize the data
+ norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
+
+ # Calculate the positive and negative ideal solutions
+ ideal_positive = norm_df.max(axis=0)
+ ideal_negative = norm_df.min(axis=0)
+
+ # Calculate the Euclidean distances
+ dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
+ dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
+
+ # Calculate the TOPSIS score
+ topsis_score = dist_negative / (dist_positive + dist_negative)
+
+ # Add the TOPSIS score to the dataframe
+ df['TOPSIS_Score'] = topsis_score
+
+ return df
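
To show what calculate_topsis_score produces: the first column is treated as an identifier and every remaining column as a benefit criterion (higher is better), since the ideal point is the column-wise maximum of the min-max-normalized data; metrics where lower is better (e.g. RMSE) would need to be inverted first. A small hedged example with made-up numbers:

import pandas as pd

scores = pd.DataFrame({
    "Model": ["GRU", "CNN", "RF", "XGBoost"],
    "PearsonR": [0.62, 0.58, 0.70, 0.66],    # hypothetical accuracy metric
    "SpearmanR": [0.55, 0.50, 0.61, 0.57],   # hypothetical second metric, higher is better
})
ranked = calculate_topsis_score(scores)
print(ranked.sort_values("TOPSIS_Score", ascending=False))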
+ #_-------------------------------------------------------------NestedKFold Cross Validation---------------------
  def NestedKFoldCrossValidation(training_data, training_additive, testing_data, testing_additive,
  training_dominance, testing_dominance, epochs, learning_rate, min_child_weight, batch_size=64,
  outer_n_splits=2, output_file='cross_validation_results.csv',
  predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True):

  # Define calculate_topsis_score before using it
+

  # Original function logic continues here
  if 'phenotypes' not in training_data.columns:

  training_genotypic_data_merged = training_data_merged.iloc[:, 2:].values
  testing_genotypic_data_merged = testing_data_merged.iloc[:, 1:].values

  outer_kf = KFold(n_splits=outer_n_splits)

  results = []

  outer_trainX = training_genotypic_data_merged[outer_train_index]
  outer_trainy = phenotypic_info[outer_train_index]

+ # Feature selection (inside the outer loop to prevent data leakage)
+ if feature_selection:
+ rf = RandomForestRegressor(n_estimators=100, random_state=42)
+ rf.fit(outer_trainX, outer_trainy) # Fit only on outer_trainX
+ selector = SelectFromModel(rf, threshold="mean", prefit=True)
+ outer_trainX = selector.transform(outer_trainX)
+ testing_genotypic_data_merged_fold = selector.transform(testing_genotypic_data_merged) # Transform testing data
+ else:
+ testing_genotypic_data_merged_fold = testing_genotypic_data_merged
+
+ # Standardization (inside the outer loop to prevent data leakage)
+ scaler = StandardScaler()
+ outer_trainX = scaler.fit_transform(outer_trainX) # Fit and transform on outer_trainX
+ testing_genotypic_data_merged_fold = scaler.transform(testing_genotypic_data_merged_fold) # Transform testing data
+
+ outer_testX = testing_genotypic_data_merged_fold
  outer_testy = phenotypic_test_info

  for model_name, model_func in models:
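
The change here moves feature selection and standardization inside the outer fold so they are fit only on that fold's training split, which is the standard way to avoid leaking held-out information into preprocessing. The same pattern in a self-contained sketch (the arrays and split count are made up):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

X = np.random.rand(60, 200)   # synthetic genotypic matrix
y = np.random.rand(60)        # synthetic phenotype

for train_idx, test_idx in KFold(n_splits=2).split(X):
    X_tr, X_te, y_tr = X[train_idx], X[test_idx], y[train_idx]

    # Fit the selector on the training fold only, then apply it to both splits
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_tr, y_tr)
    selector = SelectFromModel(rf, threshold="mean", prefit=True)
    X_tr, X_te = selector.transform(X_tr), selector.transform(X_te)

    # Same rule for the scaler: fit on the fold's training data, transform both
    scaler = StandardScaler().fit(X_tr)
    X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)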
 
  predicted_all_df.to_csv(predicted_phenotype_file, index=False)

  return avg_results_df, predicted_all_df if all_predicted_phenotypes else None
+ #-------------------------------------------------------------------Gradio Interface----------------------------------
  def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
  training_dominance_file, testing_dominance_file,feature_selection,learning_rate,min_child_weight):

  # Default parameters
  epochs = 1000
  batch_size = 64
+ outer_n_splits = 2
  #inner_n_splits = 2
  min_child_weight=5
  learning_rate=0.001

  # Launch the interface
  interface.launch()
+
+