Spectral Data Analysis Example Code
Code Example 1
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Lists to accumulate the R^2 and RMSE obtained on each seed's test split.
r2_scores = []
rmse_scores = []

# Loop over 12 different seeds to create different train/test splits,
# holding out 10% of the rows as the test set each time.
for seed in range(12):  # Adjust for the desired number of seeds.
    X_train, X_test, y_train, y_test = train_test_split(
        df[x_columns], df[y_column], test_size=0.1, random_state=seed
    )

    # Candidate numbers of PLS components. NOTE: range(1, 30) tries
    # 1 to 29 inclusive; use range(1, 31) if you want up to 30.
    param_grid = {'n_components': list(range(1, 30))}

    # 5-fold cross-validated grid search for the optimal component count.
    grid_search = GridSearchCV(
        PLSRegression(scale=True), param_grid, cv=5,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Print out the best parameters.
    print(f"Best parameters found by grid search for seed {seed}:")
    print(grid_search.best_params_)

    # Re-train on the full training split using the best component count.
    # (scale=True is the PLSRegression default, matching the search above.)
    model = PLSRegression(**grid_search.best_params_)
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split.
    y_pred = model.predict(X_test)

    # R^2 score on the test split.
    r2 = r2_score(y_test, y_pred)
    print(f"R2 score for seed {seed}: {r2}")

    # RMSE on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE for seed {seed}: {rmse}")

    # Actual vs predicted scatter plot for this seed's model.
    plt.scatter(y_test, y_pred)
    plt.title("PLSR Model")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

    r2_scores.append(r2)
    rmse_scores.append(rmse)

# Summarise model performance across all seeds.
mean_r2_score = np.mean(r2_scores)
std_deviation_r2_score = np.std(r2_scores)
print(f"Mean R2 score across all seeds: {mean_r2_score}")
print(f"Standard deviation of R2 score across all seeds: {std_deviation_r2_score}")

mean_rmse_score = np.mean(rmse_scores)
std_deviation_rmse_score = np.std(rmse_scores)
print(f"Mean RMSE score across all seeds: {mean_rmse_score}")
print(f"Standard deviation of RMSE score across all seeds: {std_deviation_rmse_score}")
Code Example 2
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your training dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Candidate numbers of PLS components. NOTE: range(1, 30) tries
# 1 to 29 inclusive; use range(1, 31) if you want up to 30.
param_grid = {'n_components': list(range(1, 30))}

# 5-fold cross-validated grid search for the optimal component count.
grid_search = GridSearchCV(
    PLSRegression(scale=True), param_grid, cv=5,
    scoring='neg_mean_squared_error'
)
grid_search.fit(df[x_columns], df[y_column])

# Print out the best parameters.
print("Best parameters found by grid search:")
print(grid_search.best_params_)

# Train the final model on the entire dataset using the best parameters.
# (scale=True is the PLSRegression default, matching the search above.)
model = PLSRegression(**grid_search.best_params_)
model.fit(df[x_columns], df[y_column])

# From this point onwards, the model can be used to predict new data.
# Load your new dataset for predicting traits.
# NOTE(review): this file must contain exactly the same reflectance columns
# (same names, same order) as the training X matrix, and no Y column —
# confirm against the field-collection export format.
samples = pd.read_csv("field_collection_1.csv")

# Make predictions on the new dataset.
y_pred = model.predict(samples)

# Save the predicted values to CSV.
# (Fixed: the original computed r2_score against an undefined `y_test` —
# there is no test split in this script — and built the DataFrame from an
# undefined `y_samples_pred`; both raised NameError.)
df_predictions = pd.DataFrame(y_pred, columns=["Predicted_Value"])
df_predictions.to_csv("predicted_values.csv", index=False)
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Lists to accumulate the R^2 and RMSE obtained on each seed's test split.
r2_scores = []
rmse_scores = []

# Loop over 12 different seeds to create different train/test splits,
# holding out 10% of the rows as the test set each time.
for seed in range(12):  # Adjust for the desired number of seeds.
    X_train, X_test, y_train, y_test = train_test_split(
        df[x_columns], df[y_column], test_size=0.1, random_state=seed
    )

    # Candidate numbers of PLS components. NOTE: range(1, 30) tries
    # 1 to 29 inclusive; use range(1, 31) if you want up to 30.
    param_grid = {'n_components': list(range(1, 30))}

    # 5-fold cross-validated grid search for the optimal component count.
    grid_search = GridSearchCV(
        PLSRegression(scale=True), param_grid, cv=5,
        scoring='neg_mean_squared_error'
    )
    grid_search.fit(X_train, y_train)

    # Print out the best parameters.
    print(f"Best parameters found by grid search for seed {seed}:")
    print(grid_search.best_params_)

    # Re-train on the full training split using the best component count.
    # (scale=True is the PLSRegression default, matching the search above.)
    model = PLSRegression(**grid_search.best_params_)
    model.fit(X_train, y_train)

    # Make predictions on the held-out test split.
    y_pred = model.predict(X_test)

    # R^2 score on the test split.
    r2 = r2_score(y_test, y_pred)
    print(f"R2 score for seed {seed}: {r2}")

    # RMSE on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE for seed {seed}: {rmse}")

    # Actual vs predicted scatter plot for this seed's model.
    plt.scatter(y_test, y_pred)
    plt.title("PLSR Model")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

    r2_scores.append(r2)
    rmse_scores.append(rmse)

# Summarise model performance across all seeds.
mean_r2_score = np.mean(r2_scores)
std_deviation_r2_score = np.std(r2_scores)
print(f"Mean R2 score across all seeds: {mean_r2_score}")
print(f"Standard deviation of R2 score across all seeds: {std_deviation_r2_score}")

mean_rmse_score = np.mean(rmse_scores)
std_deviation_rmse_score = np.std(rmse_scores)
print(f"Mean RMSE score across all seeds: {mean_rmse_score}")
print(f"Standard deviation of RMSE score across all seeds: {std_deviation_rmse_score}")
Code Example 2
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load your training dataset.
df = pd.read_csv("my_dataset.csv")

# Assuming the Y values are in the first column and the remaining columns are
# the reflectance data (e.g. wavelengths 400nm to 1900nm).
y_column = df.columns[0]
x_columns = df.columns[1:]

# Candidate numbers of PLS components. NOTE: range(1, 30) tries
# 1 to 29 inclusive; use range(1, 31) if you want up to 30.
param_grid = {'n_components': list(range(1, 30))}

# 5-fold cross-validated grid search for the optimal component count.
grid_search = GridSearchCV(
    PLSRegression(scale=True), param_grid, cv=5,
    scoring='neg_mean_squared_error'
)
grid_search.fit(df[x_columns], df[y_column])

# Print out the best parameters.
print("Best parameters found by grid search:")
print(grid_search.best_params_)

# Train the final model on the entire dataset using the best parameters.
# (scale=True is the PLSRegression default, matching the search above.)
model = PLSRegression(**grid_search.best_params_)
model.fit(df[x_columns], df[y_column])

# From this point onwards, the model can be used to predict new data.
# Load your new dataset for predicting traits.
# NOTE(review): this file must contain exactly the same reflectance columns
# (same names, same order) as the training X matrix, and no Y column —
# confirm against the field-collection export format.
samples = pd.read_csv("field_collection_1.csv")

# Make predictions on the new dataset.
y_pred = model.predict(samples)

# Save the predicted values to CSV.
# (Fixed: the original computed r2_score against an undefined `y_test` —
# there is no test split in this script — and built the DataFrame from an
# undefined `y_samples_pred`; both raised NameError.)
df_predictions = pd.DataFrame(y_pred, columns=["Predicted_Value"])
df_predictions.to_csv("predicted_values.csv", index=False)