Spectral Data Analysis Example Code
Code Example 1
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Lists to accumulate the R^2 and RMSE obtained on each seed's test split.
r2_scores = []
rmse_scores = []

# Loop over 12 different seeds to create different train/test splits,
# holding out 10% of the rows as the test set each time.
for seed in range(12):  # Adjust for the desired number of seeds.
    X_train, X_test, y_train, y_test = train_test_split(
        df[x_columns], df[y_column], test_size=0.1, random_state=seed
    )

    # Candidate numbers of PLS components. NOTE: range(1, 30) tries
    # 1 to 29 inclusive; use range(1, 31) if you want up to 30.
    param_grid = {'n_components': list(range(1, 30))}

    # 5-fold cross-validated grid search for the optimal component count.
    grid_search = GridSearchCV(
        PLSRegression(scale=True), param_grid, cv=5,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Print out the best parameters.
    print(f"Best parameters found by grid search for seed {seed}:")
    print(grid_search.best_params_)

    # Re-train on the full training split using the best component count.
    # (scale=True is the PLSRegression default, matching the search above.)
    model = PLSRegression(**grid_search.best_params_)
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split.
    y_pred = model.predict(X_test)

    # R^2 score on the test split.
    r2 = r2_score(y_test, y_pred)
    print(f"R2 score for seed {seed}: {r2}")

    # RMSE on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE for seed {seed}: {rmse}")

    # Actual vs predicted scatter plot for this seed's model.
    plt.scatter(y_test, y_pred)
    plt.title("PLSR Model")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

    r2_scores.append(r2)
    rmse_scores.append(rmse)

# Summarise model performance across all seeds.
mean_r2_score = np.mean(r2_scores)
std_deviation_r2_score = np.std(r2_scores)
print(f"Mean R2 score across all seeds: {mean_r2_score}")
print(f"Standard deviation of R2 score across all seeds: {std_deviation_r2_score}")

mean_rmse_score = np.mean(rmse_scores)
std_deviation_rmse_score = np.std(rmse_scores)
print(f"Mean RMSE score across all seeds: {mean_rmse_score}")
print(f"Standard deviation of RMSE score across all seeds: {std_deviation_rmse_score}")
Code Example 2
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your training dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Candidate numbers of PLS components. NOTE: range(1, 30) tries
# 1 to 29 inclusive; use range(1, 31) if you want up to 30.
param_grid = {'n_components': list(range(1, 30))}

# 5-fold cross-validated grid search for the optimal component count.
grid_search = GridSearchCV(
    PLSRegression(scale=True), param_grid, cv=5,
    scoring='neg_mean_squared_error'
)
grid_search.fit(df[x_columns], df[y_column])

# Print out the best parameters.
print("Best parameters found by grid search:")
print(grid_search.best_params_)

# Train the final model on the entire dataset using the best parameters.
# (scale=True is the PLSRegression default, matching the search above.)
model = PLSRegression(**grid_search.best_params_)
model.fit(df[x_columns], df[y_column])

# From this point onwards, the model can be used to predict new data.
# Load your new dataset for predicting traits.
# NOTE(review): this file must contain exactly the same reflectance columns
# (same names, same order) as the training X matrix, and no Y column —
# confirm against the field-collection export format.
samples = pd.read_csv("field_collection_1.csv")

# Make predictions on the new dataset.
y_pred = model.predict(samples)

# Save the predicted values to CSV.
# (Fixed: the original computed r2_score against an undefined `y_test` —
# there is no test split in this script — and built the DataFrame from an
# undefined `y_samples_pred`; both raised NameError.)
df_predictions = pd.DataFrame(y_pred, columns=["Predicted_Value"])
df_predictions.to_csv("predicted_values.csv", index=False)
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Lists to accumulate the R^2 and RMSE obtained on each seed's test split.
r2_scores = []
rmse_scores = []

# Loop over 12 different seeds to create different train/test splits,
# holding out 10% of the rows as the test set each time.
for seed in range(12):  # Adjust for the desired number of seeds.
    X_train, X_test, y_train, y_test = train_test_split(
        df[x_columns], df[y_column], test_size=0.1, random_state=seed
    )

    # Candidate numbers of PLS components. NOTE: range(1, 30) tries
    # 1 to 29 inclusive; use range(1, 31) if you want up to 30.
    param_grid = {'n_components': list(range(1, 30))}

    # 5-fold cross-validated grid search for the optimal component count.
    grid_search = GridSearchCV(
        PLSRegression(scale=True), param_grid, cv=5,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Print out the best parameters.
    print(f"Best parameters found by grid search for seed {seed}:")
    print(grid_search.best_params_)

    # Re-train on the full training split using the best component count.
    # (scale=True is the PLSRegression default, matching the search above.)
    model = PLSRegression(**grid_search.best_params_)
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split.
    y_pred = model.predict(X_test)

    # R^2 score on the test split.
    r2 = r2_score(y_test, y_pred)
    print(f"R2 score for seed {seed}: {r2}")

    # RMSE on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE for seed {seed}: {rmse}")

    # Actual vs predicted scatter plot for this seed's model.
    plt.scatter(y_test, y_pred)
    plt.title("PLSR Model")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

    r2_scores.append(r2)
    rmse_scores.append(rmse)

# Summarise model performance across all seeds.
mean_r2_score = np.mean(r2_scores)
std_deviation_r2_score = np.std(r2_scores)
print(f"Mean R2 score across all seeds: {mean_r2_score}")
print(f"Standard deviation of R2 score across all seeds: {std_deviation_r2_score}")

mean_rmse_score = np.mean(rmse_scores)
std_deviation_rmse_score = np.std(rmse_scores)
print(f"Mean RMSE score across all seeds: {mean_rmse_score}")
print(f"Standard deviation of RMSE score across all seeds: {std_deviation_rmse_score}")
Code Example 2
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your training dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Candidate numbers of PLS components. NOTE: range(1, 30) tries
# 1 to 29 inclusive; use range(1, 31) if you want up to 30.
param_grid = {'n_components': list(range(1, 30))}

# 5-fold cross-validated grid search for the optimal component count.
grid_search = GridSearchCV(
    PLSRegression(scale=True), param_grid, cv=5,
    scoring='neg_mean_squared_error'
)
grid_search.fit(df[x_columns], df[y_column])

# Print out the best parameters.
print("Best parameters found by grid search:")
print(grid_search.best_params_)

# Train the final model on the entire dataset using the best parameters.
# (scale=True is the PLSRegression default, matching the search above.)
model = PLSRegression(**grid_search.best_params_)
model.fit(df[x_columns], df[y_column])

# From this point onwards, the model can be used to predict new data.
# Load your new dataset for predicting traits.
# NOTE(review): this file must contain exactly the same reflectance columns
# (same names, same order) as the training X matrix, and no Y column —
# confirm against the field-collection export format.
samples = pd.read_csv("field_collection_1.csv")

# Make predictions on the new dataset.
y_pred = model.predict(samples)

# Save the predicted values to CSV.
# (Fixed: the original computed r2_score against an undefined `y_test` —
# there is no test split in this script — and built the DataFrame from an
# undefined `y_samples_pred`; both raised NameError.)
df_predictions = pd.DataFrame(y_pred, columns=["Predicted_Value"])
df_predictions.to_csv("predicted_values.csv", index=False)