AshmithaIRRI committed
Commit 2faf8d0 · verified · 1 Parent(s): be50c9f

Update app.py

Files changed (1):
  1. app.py +78 -98
app.py CHANGED
@@ -1,3 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jan 28 09:12:48 2025
+
+@author: Ashmitha
+"""
+
 # -*- coding: utf-8 -*-
 """
 Created on Sun Nov 24 12:47:37 2024
@@ -55,22 +62,14 @@ def RandomForestFeatureSelection(trainX, trainy, num_features=60):
     indices = np.argsort(importances)[-num_features:]
     return indices
 #----------------------------------------------------------GRU Model---------------------------------------------------------------------
-import numpy as np
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import GRU, Dense, BatchNormalization, Dropout, LeakyReLU
-from tensorflow.keras.optimizers import Adam
-from tensorflow.keras import regularizers
-from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.feature_selection import SelectFromModel
+
 
 def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2, feature_selection=True):
 
     # Apply feature selection using Random Forest Regressor
     if feature_selection:
         # Use RandomForestRegressor to rank features by importance
-        rf = RandomForestRegressor(n_estimators=100, random_state=60)
+        rf = RandomForestRegressor(n_estimators=100, random_state=42)
         rf.fit(trainX, trainy)
 
         # Select features with importance greater than a threshold (e.g., mean importance)
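The selection helper kept at the top of this hunk ranks markers by random-forest importance and keeps the top-k columns; np.argsort sorts ascending, so the last num_features positions hold the largest importances. A minimal sketch of that pattern on toy data (array shapes and the synthetic signal are illustrative, not from app.py):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    # Toy genotype matrix: 100 samples x 20 markers; phenotype driven by markers 3 and 7
    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 20))
    y = 2.0 * X[:, 3] - 1.5 * X[:, 7] + rng.normal(scale=0.1, size=100)

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X, y)

    # argsort is ascending, so take the last k indices for the k most important markers
    num_features = 5
    indices = np.argsort(rf.feature_importances_)[-num_features:]
    X_selected = X[:, indices]
    print(sorted(indices), X_selected.shape)  # markers 3 and 7 should be among the survivors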
@@ -184,10 +183,6 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
     model = Sequential()
 
     # Convolutional layers
-    model.add(Conv1D(512, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
-    model.add(MaxPooling1D(pool_size=2))
-    model.add(Dropout(dropout_rate))
-
     model.add(Conv1D(256, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
     model.add(MaxPooling1D(pool_size=2))
     model.add(Dropout(dropout_rate))
@@ -195,14 +190,10 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
     model.add(Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
     model.add(MaxPooling1D(pool_size=2))
     model.add(Dropout(dropout_rate))
-
-    model.add(Conv1D(64, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
-    model.add(MaxPooling1D(pool_size=2))
-    model.add(Dropout(dropout_rate))
 
     # Flatten and Dense layers
     model.add(Flatten())
-    model.add(Dense(32, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
+    model.add(Dense(64, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
     model.add(LeakyReLU(alpha=0.1))
     model.add(Dropout(dropout_rate))
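Taken together, the two CNN hunks above leave a two-block convolutional stack (256 then 128 filters) feeding a widened 64-unit dense layer. A self-contained sketch of the resulting head; the final Dense(1) output and the Adam compile step are assumptions, since the rest of CNNModel is outside this diff:

    from tensorflow.keras import regularizers
    from tensorflow.keras.layers import (Conv1D, Dense, Dropout, Flatten,
                                         LeakyReLU, MaxPooling1D)
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.optimizers import Adam

    def build_trimmed_cnn(n_features, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2):
        reg = regularizers.l1_l2(l1=l1_reg, l2=l2_reg)
        model = Sequential([
            Conv1D(256, kernel_size=3, activation='relu',
                   input_shape=(n_features, 1), kernel_regularizer=reg),
            MaxPooling1D(pool_size=2),
            Dropout(dropout_rate),
            Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=reg),
            MaxPooling1D(pool_size=2),
            Dropout(dropout_rate),
            Flatten(),
            Dense(64, kernel_regularizer=reg),  # widened from 32 to 64 in this commit
            LeakyReLU(alpha=0.1),
            Dropout(dropout_rate),
            Dense(1),                           # assumed single-trait regression output
        ])
        model.compile(optimizer=Adam(learning_rate=1e-4), loss='mse')
        return model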
 
@@ -320,45 +311,57 @@ def read_csv_file(uploaded_file):
 
 #-----------------------------------------------------------------calculate topsis score--------------------------------------------------------
 
-
 def calculate_topsis_score(df):
-    # Normalize the metrics
-    metrics = df[['Train_MSE', 'Train_RMSE', 'Train_R2', 'Train_Corr']].dropna()  # Ensure no NaN values
-    norm_metrics = metrics / np.sqrt((metrics ** 2).sum(axis=0))
-
-    # Define ideal best and worst for each metric
-    ideal_best = pd.Series(index=norm_metrics.columns)
-    ideal_worst = pd.Series(index=norm_metrics.columns)
-
-    # For RMSE and MSE (minimization criteria): min is best, max is worst
-    for col in ['Train_MSE', 'Train_RMSE']:
-        ideal_best[col] = norm_metrics[col].min()
-        ideal_worst[col] = norm_metrics[col].max()
-
-    # For R2 and Corr (maximization criteria): max is best, min is worst
-    for col in ['Train_R2', 'Train_Corr']:
-        ideal_best[col] = norm_metrics[col].max()
-        ideal_worst[col] = norm_metrics[col].min()
-
-    # Calculate Euclidean distance to ideal best and worst
-    dist_to_best = np.sqrt(((norm_metrics - ideal_best) ** 2).sum(axis=1))
-    dist_to_worst = np.sqrt(((norm_metrics - ideal_worst) ** 2).sum(axis=1))
-
-    # Calculate TOPSIS score
-    topsis_score = dist_to_worst / (dist_to_best + dist_to_worst)
-    df['TOPSIS_Score'] = np.nan  # Initialize with NaN
-    df.loc[metrics.index, 'TOPSIS_Score'] = topsis_score  # Assign TOPSIS scores
+    # Normalize the data
+    norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
+
+    # Calculate the positive and negative ideal solutions
+    ideal_positive = norm_df.max(axis=0)
+    ideal_negative = norm_df.min(axis=0)
+
+    # Calculate the Euclidean distances
+    dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
+    dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
+
+    # Calculate the TOPSIS score
+    topsis_score = dist_negative / (dist_positive + dist_negative)
+
+    # Add the TOPSIS score to the dataframe
+    df['TOPSIS_Score'] = topsis_score
+
     return df
+    # Calculate the TOPSIS score for the average metrics
+
 
 #--------------------------------------------------- Nested Cross validation---------------------------------------------------------------------------
 
-def NestedKFoldCrossValidation(
-    training_data, training_additive, testing_data, testing_additive,
-    training_dominance, testing_dominance, epochs, learning_rate, min_child_weight,
-    batch_size=64, outer_n_splits=2, output_file='cross_validation_results.csv',
-    predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True
-):
+def NestedKFoldCrossValidation(training_data, training_additive, testing_data, testing_additive,
+                               training_dominance, testing_dominance, epochs, learning_rate, min_child_weight, batch_size=64,
+                               outer_n_splits=2, output_file='cross_validation_results.csv',
+                               predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True):
+
+    # Define calculate_topsis_score before using it
+    def calculate_topsis_score(df):
+        # Normalize the data
+        norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
+
+        # Calculate the positive and negative ideal solutions
+        ideal_positive = norm_df.max(axis=0)
+        ideal_negative = norm_df.min(axis=0)
+
+        # Calculate the Euclidean distances
+        dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
+        dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
+
+        # Calculate the TOPSIS score
+        topsis_score = dist_negative / (dist_positive + dist_negative)
+
+        # Add the TOPSIS score to the dataframe
+        df['TOPSIS_Score'] = topsis_score
+
+        return df
 
+    # Original function logic continues here
     if 'phenotypes' not in training_data.columns:
         raise ValueError("Training data does not contain the 'phenotypes' column.")
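A toy run of the rewritten calculate_topsis_score, assuming a metrics frame whose first column is the model name. Note the min-max rewrite takes each column's maximum as the positive ideal, so every metric is treated as benefit-like; the columns here are chosen accordingly:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'Model': ['GRU', 'CNN', 'XGB'],
        'Train_R2':   [0.80, 0.70, 0.75],
        'Train_Corr': [0.90, 0.85, 0.87],
    })

    # Min-max normalize every column after 'Model'
    norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
    ideal_positive = norm_df.max(axis=0)  # per-column best; larger is better here
    ideal_negative = norm_df.min(axis=0)
    dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
    dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
    df['TOPSIS_Score'] = dist_negative / (dist_positive + dist_negative)
    print(df)  # GRU dominates both columns, so it scores 1.0; CNN scores 0.0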
 
@@ -381,7 +384,7 @@ def NestedKFoldCrossValidation(
 
     # Feature selection
     if feature_selection:
-        rf = RandomForestRegressor(n_estimators=1000, random_state=60)
+        rf = RandomForestRegressor(n_estimators=100, random_state=60)
         rf.fit(training_genotypic_data_merged, phenotypic_info)
         selector = SelectFromModel(rf, threshold="mean", prefit=True)
         training_genotypic_data_merged = selector.transform(training_genotypic_data_merged)
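The only change in this hunk is the forest size (1000 trees down to 100); the surrounding selection step keeps every feature whose importance exceeds the mean importance. A toy sketch of that SelectFromModel pattern in isolation (data shapes are illustrative):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    rng = np.random.default_rng(1)
    X = rng.normal(size=(200, 30))
    y = X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=200)

    rf = RandomForestRegressor(n_estimators=100, random_state=60)
    rf.fit(X, y)

    # threshold="mean": keep features whose importance exceeds the mean importance;
    # prefit=True tells SelectFromModel the forest is already fitted
    selector = SelectFromModel(rf, threshold="mean", prefit=True)
    X_reduced = selector.transform(X)
    print(X_reduced.shape)  # far fewer than 30 columns survive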
@@ -455,44 +458,33 @@ def NestedKFoldCrossValidation(
 
     # Compile results
     results_df = pd.DataFrame(results)
-    avg_results_df = results_df.groupby('Model').agg({
-        'Train_MSE': 'mean',
-        'Train_RMSE': 'mean',
-        'Train_R2': 'mean',
-        'Train_Corr': 'mean',
-        'Test_MSE': 'mean',
-        'Test_RMSE': 'mean',
-        'Test_R2': 'mean',
-        'Test_Corr': 'mean'
-    }).reset_index()
-
-    # Calculate the TOPSIS score for the average metrics (considering only MSE, RMSE, R², and Correlation)
-    def calculate_topsis_score(df):
-        # Normalize the data
-        norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
-
-        # Calculate the positive and negative ideal solutions
-        ideal_positive = norm_df.max(axis=0)
-        ideal_negative = norm_df.min(axis=0)
-
-        # Calculate the Euclidean distances
-        dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
-        dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
-
-        # Calculate the TOPSIS score
-        topsis_score = dist_negative / (dist_positive + dist_negative)
-
-        # Add the TOPSIS score to the dataframe
-        df['TOPSIS_Score'] = topsis_score
-
-        return df
+    # Calculate the average metrics for each model
+    if 'phenotypes' in testing_data.columns:
+        avg_results_df = results_df.groupby('Model').agg({
+            'Train_MSE': 'mean',
+            'Train_RMSE': 'mean',
+            'Train_R2': 'mean',
+            'Train_Corr': 'mean',
+            'Test_MSE': 'mean',
+            'Test_RMSE': 'mean',
+            'Test_R2': 'mean',
+            'Test_Corr': 'mean'
+        }).reset_index()
+    else:
+        avg_results_df = results_df.groupby('Model').agg({
+            'Train_MSE': 'mean',
+            'Train_RMSE': 'mean',
+            'Train_R2': 'mean',
+            'Train_Corr': 'mean'
+        }).reset_index()
 
     avg_results_df = calculate_topsis_score(avg_results_df)
 
     # Save the results with TOPSIS scores to the file
     avg_results_df.to_csv(output_file, index=False)
 
-    # Save predicted phenotypes
+    # Save predicted phenotypes
     if all_predicted_phenotypes:
         predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
         predicted_all_df.to_csv(predicted_phenotype_file, index=False)
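The new branch above averages per-model fold metrics, aggregating the Test_* columns only when the testing set carries observed phenotypes. A toy illustration of the no-test-phenotypes branch with two folds per model:

    import pandas as pd

    results = [
        {'Model': 'GRU', 'Train_MSE': 0.20, 'Train_RMSE': 0.45, 'Train_R2': 0.80, 'Train_Corr': 0.90},
        {'Model': 'GRU', 'Train_MSE': 0.24, 'Train_RMSE': 0.49, 'Train_R2': 0.76, 'Train_Corr': 0.88},
        {'Model': 'CNN', 'Train_MSE': 0.30, 'Train_RMSE': 0.55, 'Train_R2': 0.70, 'Train_Corr': 0.84},
        {'Model': 'CNN', 'Train_MSE': 0.26, 'Train_RMSE': 0.51, 'Train_R2': 0.74, 'Train_Corr': 0.86},
    ]
    results_df = pd.DataFrame(results)

    # One averaged row per model, same shape the TOPSIS step receives
    avg_results_df = results_df.groupby('Model').agg({
        'Train_MSE': 'mean', 'Train_RMSE': 'mean',
        'Train_R2': 'mean', 'Train_Corr': 'mean',
    }).reset_index()
    print(avg_results_df)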
@@ -500,15 +492,6 @@ def NestedKFoldCrossValidation(
     return avg_results_df, predicted_all_df if all_predicted_phenotypes else None
 
 
-    # Save the results to the file
-    #results_df.to_csv(output_file, index=False)
-
-    # Save predicted phenotypes
-    #if all_predicted_phenotypes:
-    #    predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
-    #predicted_all_df.to_csv(predicted_phenotype_file, index=False)
-
-    # return results_df, predicted_all_df if all_predicted_phenotypes else None
 
 #--------------------------------------------------------------------Gradio interface---------------------------------------------------------------
 
  #--------------------------------------------------------------------Gradio interface---------------------------------------------------------------
514
 
@@ -519,7 +502,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
519
  epochs = 1000
520
  batch_size = 64
521
  outer_n_splits = 2
522
- inner_n_splits = 2
523
  min_child_weight=5
524
  learning_rate=0.001
525
  #learning_rate=learning_rate
@@ -543,7 +526,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
         testing_dominance=testing_dominance,
         epochs=epochs,
         batch_size=batch_size,
-        #outer_n_splits=outer_n_splits,
+        outer_n_splits=outer_n_splits,
         #inner_n_splits=inner_n_splits,
         learning_rate=learning_rate,
         min_child_weight=min_child_weight,
@@ -591,6 +574,3 @@ with gr.Blocks() as interface:
 
 # Launch the interface
 interface.launch()
-
-
-
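For orientation, the interface.launch() closing the diff sits under the gr.Blocks() layout named in the hunk header. A minimal skeleton of how such a Blocks app is typically wired; the component labels and the completion of run_cross_validation's truncated signature are illustrative guesses, not the app's exact UI:

    import gradio as gr
    import pandas as pd

    def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file):
        # Placeholder: the real app calls NestedKFoldCrossValidation and returns the averaged metrics
        return pd.DataFrame({'Model': [], 'TOPSIS_Score': []})

    with gr.Blocks() as interface:
        gr.Markdown("## Genomic prediction with nested cross-validation")
        training_file = gr.File(label="Training data (CSV)")
        training_additive_file = gr.File(label="Training additive matrix (CSV)")
        testing_file = gr.File(label="Testing data (CSV)")
        testing_additive_file = gr.File(label="Testing additive matrix (CSV)")
        run_btn = gr.Button("Run cross-validation")
        results_out = gr.Dataframe(label="Averaged metrics with TOPSIS score")
        run_btn.click(
            run_cross_validation,
            inputs=[training_file, training_additive_file, testing_file, testing_additive_file],
            outputs=results_out,
        )

    # Launch the interface
    interface.launch()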
 