Update app.py
app.py CHANGED
@@ -1,3 +1,10 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jan 28 09:12:48 2025
+
+@author: Ashmitha
+"""
+
 # -*- coding: utf-8 -*-
 """
 Created on Sun Nov 24 12:47:37 2024
@@ -55,22 +62,14 @@ def RandomForestFeatureSelection(trainX, trainy, num_features=60):
     indices = np.argsort(importances)[-num_features:]
     return indices
 #----------------------------------------------------------GRU Model---------------------------------------------------------------------
-
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.layers import GRU, Dense, BatchNormalization, Dropout, LeakyReLU
-from tensorflow.keras.optimizers import Adam
-from tensorflow.keras import regularizers
-from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.feature_selection import SelectFromModel
+

 def GRUModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2, feature_selection=True):

     # Apply feature selection using Random Forest Regressor
     if feature_selection:
         # Use RandomForestRegressor to rank features by importance
-        rf = RandomForestRegressor(n_estimators=100, random_state=
+        rf = RandomForestRegressor(n_estimators=100, random_state=42)
         rf.fit(trainX, trainy)

         # Select features with importance greater than a threshold (e.g., mean importance)
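Note: the Keras and scikit-learn imports removed above presumably already live near the top of app.py, since GRUModel still uses RandomForestRegressor in the unchanged context. As a rough sketch of the kind of regressor these layers imply (layer widths and defaults are illustrative assumptions, not this Space's actual GRUModel):

# A minimal, illustrative GRU regressor built from the layers imported above.
# Widths and hyperparameter defaults are assumptions for the sketch only.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, BatchNormalization, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

def build_gru(n_timesteps, n_features, learning_rate=0.0001,
              l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2):
    model = Sequential()
    model.add(GRU(64, input_shape=(n_timesteps, n_features),
                  kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
    model.add(BatchNormalization())
    model.add(Dense(32))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))  # one continuous phenotype value
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model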
@@ -184,10 +183,6 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
     model = Sequential()

     # Convolutional layers
-    model.add(Conv1D(512, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
-    model.add(MaxPooling1D(pool_size=2))
-    model.add(Dropout(dropout_rate))
-
     model.add(Conv1D(256, kernel_size=3, activation='relu', input_shape=(trainX.shape[1], 1), kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
     model.add(MaxPooling1D(pool_size=2))
     model.add(Dropout(dropout_rate))
@@ -195,14 +190,10 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
     model.add(Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
     model.add(MaxPooling1D(pool_size=2))
     model.add(Dropout(dropout_rate))
-
-    model.add(Conv1D(64, kernel_size=3, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
-    model.add(MaxPooling1D(pool_size=2))
-    model.add(Dropout(dropout_rate))

     # Flatten and Dense layers
     model.add(Flatten())
-    model.add(Dense(
+    model.add(Dense(64, kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)))
     model.add(LeakyReLU(alpha=0.1))
     model.add(Dropout(dropout_rate))

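Note: each MaxPooling1D(pool_size=2) halves the sequence length, and each Conv1D with kernel_size=3 and Keras's default 'valid' padding trims two steps, so dropping the 512- and 64-filter blocks also preserves temporal resolution on short inputs. A quick self-contained shape check, assuming an input of about 60 steps (roughly what the feature selector keeps):

# Track sequence length through a Conv1D (kernel_size=3, 'valid' padding)
# plus MaxPooling1D (pool_size=2) stack.
def out_len(n_steps, n_blocks, kernel_size=3, pool_size=2):
    for _ in range(n_blocks):
        n_steps -= kernel_size - 1  # Conv1D with 'valid' padding
        n_steps //= pool_size       # MaxPooling1D
    return n_steps

print(out_len(60, 4))  # 1  -> the old four-block stack leaves a single step
print(out_len(60, 2))  # 13 -> the remaining two blocks keep more resolution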
@@ -320,45 +311,57 @@ def read_csv_file(uploaded_file):

 #-----------------------------------------------------------------calculate topsis score--------------------------------------------------------

-
 def calculate_topsis_score(df):
-    # Normalize the
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Calculate Euclidean distance to ideal best and worst
-    dist_to_best = np.sqrt(((norm_metrics - ideal_best) ** 2).sum(axis=1))
-    dist_to_worst = np.sqrt(((norm_metrics - ideal_worst) ** 2).sum(axis=1))
-
-    # Calculate TOPSIS score
-    topsis_score = dist_to_worst / (dist_to_best + dist_to_worst)
-    df['TOPSIS_Score'] = np.nan  # Initialize with NaN
-    df.loc[metrics.index, 'TOPSIS_Score'] = topsis_score  # Assign TOPSIS scores
+    # Normalize the data
+    norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
+
+    # Calculate the positive and negative ideal solutions
+    ideal_positive = norm_df.max(axis=0)
+    ideal_negative = norm_df.min(axis=0)
+
+    # Calculate the Euclidean distances
+    dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
+    dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
+
+    # Calculate the TOPSIS score
+    topsis_score = dist_negative / (dist_positive + dist_negative)
+
+    # Add the TOPSIS score to the dataframe
+    df['TOPSIS_Score'] = topsis_score
+
     return df
+# Calculate the TOPSIS score for the average metrics
+

 #--------------------------------------------------- Nested Cross validation---------------------------------------------------------------------------

-def NestedKFoldCrossValidation(
-
-
-
-
-
+def NestedKFoldCrossValidation(training_data, training_additive, testing_data, testing_additive,
+                               training_dominance, testing_dominance, epochs, learning_rate, min_child_weight, batch_size=64,
+                               outer_n_splits=2, output_file='cross_validation_results.csv',
+                               predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True):
+
+    # Define calculate_topsis_score before using it
+    def calculate_topsis_score(df):
+        # Normalize the data
+        norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
+
+        # Calculate the positive and negative ideal solutions
+        ideal_positive = norm_df.max(axis=0)
+        ideal_negative = norm_df.min(axis=0)
+
+        # Calculate the Euclidean distances
+        dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
+        dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
+
+        # Calculate the TOPSIS score
+        topsis_score = dist_negative / (dist_positive + dist_negative)
+
+        # Add the TOPSIS score to the dataframe
+        df['TOPSIS_Score'] = topsis_score
+
+        return df

+    # Original function logic continues here
     if 'phenotypes' not in training_data.columns:
         raise ValueError("Training data does not contain the 'phenotypes' column.")

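Note: the rewritten helper min-max normalizes each metric column, then scores each model as dist_negative / (dist_positive + dist_negative), i.e. its relative closeness to the column-wise best values. One caveat worth flagging: classic TOPSIS treats error metrics such as MSE and RMSE as cost criteria (lower is better), so those columns would need inverting before the column maximum can serve as the ideal point. A tiny worked example with illustrative numbers:

import numpy as np
import pandas as pd

# Two models, two benefit criteria (higher is better). With only two rows,
# min-max normalization maps each column to {0, 1}, so the scores are 1 and 0.
df = pd.DataFrame({'Model': ['GRU', 'CNN'],
                   'Test_R2': [0.80, 0.60],
                   'Test_Corr': [0.90, 0.75]})

norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
ideal_positive = norm_df.max(axis=0)  # best value per column
ideal_negative = norm_df.min(axis=0)  # worst value per column
dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
df['TOPSIS_Score'] = dist_negative / (dist_positive + dist_negative)
print(df)  # GRU -> 1.0, CNN -> 0.0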
@@ -381,7 +384,7 @@ def NestedKFoldCrossValidation(

     # Feature selection
     if feature_selection:
-        rf = RandomForestRegressor(n_estimators=
+        rf = RandomForestRegressor(n_estimators=100, random_state=60)
         rf.fit(training_genotypic_data_merged, phenotypic_info)
         selector = SelectFromModel(rf, threshold="mean", prefit=True)
         training_genotypic_data_merged = selector.transform(training_genotypic_data_merged)
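Note: isolated from this hunk, the selection step works as in the sketch below (synthetic data for illustration; threshold="mean" keeps features whose importance exceeds the average importance, and prefit=True reuses the already-fitted forest instead of refitting):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 50))          # 100 samples, 50 markers
y = 2 * X[:, 0] + rng.normal(size=100)  # only the first marker is informative

rf = RandomForestRegressor(n_estimators=100, random_state=60)
rf.fit(X, y)

selector = SelectFromModel(rf, threshold="mean", prefit=True)
X_selected = selector.transform(X)
print(X.shape, '->', X_selected.shape)  # far fewer than 50 columns survive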
@@ -455,44 +458,33 @@ def NestedKFoldCrossValidation(

     # Compile results
     results_df = pd.DataFrame(results)
-    avg_results_df = results_df.groupby('Model').agg({
-        'Train_MSE': 'mean',
-        'Train_RMSE': 'mean',
-        'Train_R2': 'mean',
-        'Train_Corr': 'mean',
-        'Test_MSE': 'mean',
-        'Test_RMSE': 'mean',
-        'Test_R2': 'mean',
-        'Test_Corr': 'mean'
-    }).reset_index()
-
-    # Calculate the TOPSIS score for the average metrics (considering only MSE, RMSE, R², and Correlation)
-    def calculate_topsis_score(df):
-        # Normalize the data
-        norm_df = (df.iloc[:, 1:] - df.iloc[:, 1:].min()) / (df.iloc[:, 1:].max() - df.iloc[:, 1:].min())
-
-        # Calculate the positive and negative ideal solutions
-        ideal_positive = norm_df.max(axis=0)
-        ideal_negative = norm_df.min(axis=0)
-
-        # Calculate the Euclidean distances
-        dist_positive = np.sqrt(((norm_df - ideal_positive) ** 2).sum(axis=1))
-        dist_negative = np.sqrt(((norm_df - ideal_negative) ** 2).sum(axis=1))
-
-        # Calculate the TOPSIS score
-        topsis_score = dist_negative / (dist_positive + dist_negative)

-
-
-
-
+    # Calculate the average metrics for each model
+    if 'phenotypes' in testing_data.columns:
+        avg_results_df = results_df.groupby('Model').agg({
+            'Train_MSE': 'mean',
+            'Train_RMSE': 'mean',
+            'Train_R2': 'mean',
+            'Train_Corr': 'mean',
+            'Test_MSE': 'mean',
+            'Test_RMSE': 'mean',
+            'Test_R2': 'mean',
+            'Test_Corr': 'mean'
+        }).reset_index()
+    else:
+        avg_results_df = results_df.groupby('Model').agg({
+            'Train_MSE': 'mean',
+            'Train_RMSE': 'mean',
+            'Train_R2': 'mean',
+            'Train_Corr': 'mean'
+        }).reset_index()

     avg_results_df = calculate_topsis_score(avg_results_df)

     # Save the results with TOPSIS scores to the file
     avg_results_df.to_csv(output_file, index=False)

-# Save predicted phenotypes
+    # Save predicted phenotypes
     if all_predicted_phenotypes:
         predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
         predicted_all_df.to_csv(predicted_phenotype_file, index=False)
@@ -500,15 +492,6 @@ def NestedKFoldCrossValidation(
     return avg_results_df, predicted_all_df if all_predicted_phenotypes else None


-    # Save the results to the file
-    #results_df.to_csv(output_file, index=False)
-
-    # Save predicted phenotypes
-    #if all_predicted_phenotypes:
-        # predicted_all_df = pd.concat(all_predicted_phenotypes, axis=0, ignore_index=True)
-        #predicted_all_df.to_csv(predicted_phenotype_file, index=False)
-
-    # return results_df, predicted_all_df if all_predicted_phenotypes else None

 #--------------------------------------------------------------------Gradio interface---------------------------------------------------------------

@@ -519,7 +502,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
     epochs = 1000
     batch_size = 64
     outer_n_splits = 2
-    inner_n_splits = 2
+    #inner_n_splits = 2
     min_child_weight=5
     learning_rate=0.001
     #learning_rate=learning_rate
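Note: with inner_n_splits commented out here (and in the call below), the procedure is effectively a single-level outer k-fold rather than a truly nested one. For reference, nested cross-validation tunes hyperparameters inside each outer fold; a minimal sketch with an illustrative model and grid, not this Space's models:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

X, y = np.random.rand(40, 10), np.random.rand(40)
outer_cv = KFold(n_splits=2, shuffle=True, random_state=60)

scores = []
for train_idx, test_idx in outer_cv.split(X):
    # Inner loop: tune hyperparameters on the outer-training portion only
    search = GridSearchCV(RandomForestRegressor(random_state=60),
                          param_grid={'n_estimators': [50, 100]},
                          cv=KFold(n_splits=2, shuffle=True, random_state=60))
    search.fit(X[train_idx], y[train_idx])
    # Outer loop: score the tuned model on held-out data
    scores.append(search.score(X[test_idx], y[test_idx]))
print(np.mean(scores))  # estimate of tuned-model performance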
@@ -543,7 +526,7 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
         testing_dominance=testing_dominance,
         epochs=epochs,
         batch_size=batch_size,
-
+        outer_n_splits=outer_n_splits,
         #inner_n_splits=inner_n_splits,
         learning_rate=learning_rate,
         min_child_weight=min_child_weight,
@@ -591,6 +574,3 @@ with gr.Blocks() as interface:

 # Launch the interface
 interface.launch()
-
-
-
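Note: interface is the gr.Blocks app built earlier in the file. For context, a minimal skeleton following the same pattern (the file input and callback are illustrative stand-ins for this Space's actual components):

import gradio as gr

def run(training_file):
    # Illustrative stand-in for run_cross_validation
    return "no file" if training_file is None else f"received: {training_file}"

with gr.Blocks() as interface:
    training_file = gr.File(label="Training data (CSV)")
    status = gr.Textbox(label="Status")
    run_btn = gr.Button("Run cross-validation")
    run_btn.click(run, inputs=training_file, outputs=status)

# Launch the interface
interface.launch()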