Building a Simple Neural Network in R with torch

The torch package brings deep learning to R by providing bindings to the popular PyTorch library. This tutorial demonstrates how to build and train a simple neural network using torch in R.

Installation

# Install the R package, then download the libtorch backend on first use
# install.packages("torch")
library(torch)
# torch::install_torch()

# Additional packages used later in this tutorial
library(ggplot2)    # plotting
library(magrittr)   # %>% pipe used in the model definition
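If you are unsure whether the libtorch backend has already been downloaded, a quick check (not part of the original instructions) is:

# Returns TRUE once the libtorch backend is available; if FALSE, run torch::install_torch()
torch::torch_is_installed()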
A Simple Neural Network
This section walks through creating a neural network for a simple regression task.
1. Sample Data
# Set seed for reproducibility
set.seed(42)
torch_manual_seed(42)   # torch tensors draw from torch's own RNG, so seed it as well

# Generate training data: y = 3x + 2 + noise
x <- torch_randn(100, 1)
y <- 3 * x + 2 + torch_randn(100, 1) * 0.3

# Display the first few data points
head(data.frame(
  x = as.numeric(x$squeeze()),
  y = as.numeric(y$squeeze())
))
            x         y
1 -0.02329975  1.861628
2  1.92341769  7.555232
3  0.11041667  2.613283
4 -2.55959392 -5.931758
5  0.36482519  3.099005
6  0.97125226  4.551073
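As a quick sanity check (an addition to the original code), you can confirm the tensor shapes and look at summary statistics of the generated data:

# Both tensors should have shape 100 x 1
x$shape
y$shape

# Summaries of the data converted back to plain R vectors
summary(as.numeric(x$squeeze()))
summary(as.numeric(y$squeeze()))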
2. Neural Network Module
The next step involves defining the neural network architecture using torch's module system:
# Define a simple feedforward neural network
nnet <- nn_module(
  initialize = function() {
    # Define layers
    self$layer1 <- nn_linear(1, 8)  # Input layer to hidden layer (1 -> 8 neurons)
    self$layer2 <- nn_linear(8, 1)  # Hidden layer to output layer (8 -> 1 neuron)
  },
  forward = function(x) {
    # Define forward pass
    x %>%
      self$layer1() %>%  # First linear transformation
      nnf_relu() %>%     # ReLU activation function
      self$layer2()      # Second linear transformation
  }
)
# Instantiate the model
model <- nnet()

# Display model structure
print(model)
An `nn_module` containing 25 parameters.
── Modules ─────────────────────────────────────────────────────────────────────
• layer1: <nn_linear> #16 parameters
• layer2: <nn_linear> #9 parameters
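Before training, it can be reassuring to run a quick smoke test (an addition to the original tutorial; the input values are arbitrary) to confirm the untrained model accepts a batch and returns one prediction per row:

# Forward pass on a tiny 3 x 1 batch; expect a 3 x 1 tensor of (untrained) outputs
test_input <- torch_tensor(matrix(c(-1, 0, 1), ncol = 1), dtype = torch_float())
model(test_input)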
3. Set Up the Optimizer and Loss Function
The training process requires defining how the model will learn from the data:
# Set up optimizer (Adam optimizer with learning rate 0.02)
optimizer <- optim_adam(model$parameters, lr = 0.02)

# Define loss function (Mean Squared Error for regression)
loss_fn <- nnf_mse_loss
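To build intuition for what nnf_mse_loss() computes, here is a tiny illustration on hand-made tensors (an addition; the values are arbitrary):

# MSE is the mean of squared differences between predictions and targets
pred_demo   <- torch_tensor(c(1, 2, 3))
target_demo <- torch_tensor(c(1, 2, 5))
nnf_mse_loss(pred_demo, target_demo)  # (0^2 + 0^2 + 2^2) / 3 ≈ 1.33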
4. Training Loop
The neural network training process proceeds as follows:
# Store loss values for plotting
loss_history <- numeric(300)

# Training loop
for(epoch in 1:300) {
  # Set model to training mode
  model$train()

  # Reset gradients
  optimizer$zero_grad()

  # Forward pass
  y_pred <- model(x)

  # Calculate loss
  loss <- loss_fn(y_pred, y)

  # Backward pass
  loss$backward()

  # Update parameters
  optimizer$step()

  # Store loss for plotting
  loss_history[epoch] <- loss$item()
}
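Before plotting, a quick check (an optional addition, using the loss_history vector filled in above) confirms that the loss has flattened out by the end of training:

# The last few recorded losses should be small and roughly constant
tail(round(loss_history, 4))
cat("Final MSE after 300 epochs:", loss_history[300], "\n")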
5. Visualize the Training Progress
The following visualization demonstrates how the loss decreased during training:
# Create a data frame for plotting
training_df <- data.frame(
  epoch = 1:300,
  loss = loss_history
)

# Plot training loss
ggplot(training_df, aes(x = epoch, y = loss)) +
  geom_line(color = "#2c3e50", size = 1) +
  labs(
    title = "Training Loss Over Time",
    subtitle = "Neural Network Learning Progress",
    x = "Epoch",
    y = "Mean Squared Error Loss"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60")
  )
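Because the loss typically drops steeply in the first few epochs, a log-scaled y-axis (an optional variant, not in the original plot) can make the later epochs easier to inspect:

# Optional variant: same loss curve on a log10 y-axis
ggplot(training_df, aes(x = epoch, y = loss)) +
  geom_line(color = "#2c3e50") +
  scale_y_log10() +
  labs(title = "Training Loss (log scale)", x = "Epoch", y = "MSE Loss (log10 scale)") +
  theme_minimal()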
6. Visualize the Results
The following analysis demonstrates how well the trained model performs:
# Set model to evaluation mode
model$eval()

# Generate predictions
with_no_grad({
  y_pred <- model(x)
})

# Convert to R vectors for plotting
x_np      <- as.numeric(x$squeeze())
y_np      <- as.numeric(y$squeeze())
y_pred_np <- as.numeric(y_pred$squeeze())

# Create data frame for ggplot
plot_df <- data.frame(
  x = x_np,
  y_actual = y_np,
  y_predicted = y_pred_np
)
# Create the plot
ggplot(plot_df, aes(x = x)) +
  geom_point(aes(y = y_actual, color = "Actual"), alpha = 0.7, size = 2) +
  geom_point(aes(y = y_predicted, color = "Predicted"), alpha = 0.7, size = 2) +
  geom_smooth(aes(y = y_predicted), method = "loess", se = FALSE,
              color = "#e74c3c", linetype = "dashed") +
  labs(
    title = "Neural Network Regression Results",
    subtitle = "Comparing actual vs predicted values",
    x = "Input (x)",
    y = "Output (y)",
    color = "Data Type"
  ) +
  scale_color_manual(values = c("Actual" = "#3498db", "Predicted" = "#e74c3c")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    legend.position = "top"
  )
7. Model Performance Analysis
The following analysis examines how well the model learned the underlying pattern:
# Calculate performance metrics
mse       <- mean((y_pred_np - y_np)^2)
rmse      <- sqrt(mse)
mae       <- mean(abs(y_pred_np - y_np))
r_squared <- cor(y_pred_np, y_np)^2

# Create performance summary
performance_summary <- data.frame(
  Metric = c("Mean Squared Error", "Root Mean Squared Error",
             "Mean Absolute Error", "R-squared"),
  Value = c(mse, rmse, mae, r_squared)
)
print(performance_summary)
                   Metric      Value
1      Mean Squared Error 0.09061213
2 Root Mean Squared Error 0.30101848
3     Mean Absolute Error 0.23722124
4               R-squared 0.99000990
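As an informal baseline (not part of the original analysis), an ordinary least-squares fit on the same vectors shows how close the data are to the true line y = 3x + 2 and gives an R-squared to compare against the network's:

# Baseline: ordinary least squares on the same data
lm_fit <- lm(y_np ~ x_np)
coef(lm_fit)               # intercept and slope; should land near 2 and 3
summary(lm_fit)$r.squared  # compare with the network's R-squared above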
# Compare with true relationship (y = 3x + 2)
# Generate predictions on a grid for comparison
x_grid <- torch_linspace(-3, 3, 100)$unsqueeze(2)
with_no_grad({
  y_grid_pred <- model(x_grid)
})

x_grid_np      <- as.numeric(x_grid$squeeze())
y_grid_pred_np <- as.numeric(y_grid_pred$squeeze())
y_grid_true    <- 3 * x_grid_np + 2

# Plot comparison
comparison_df <- data.frame(
  x = x_grid_np,
  y_true = y_grid_true,
  y_predicted = y_grid_pred_np
)
ggplot(comparison_df, aes(x = x)) +
  geom_line(aes(y = y_true, color = "True Function"), size = 2) +
  geom_line(aes(y = y_predicted, color = "Neural Network"), size = 2, linetype = "dashed") +
  geom_point(data = plot_df, aes(y = y_actual), alpha = 0.3, color = "gray50") +
  labs(
    title = "Neural Network vs True Function",
    subtitle = "Model learning assessment against the underlying pattern",
    x = "Input (x)",
    y = "Output (y)",
    color = "Function Type"
  ) +
  scale_color_manual(values = c("True Function" = "#2c3e50", "Neural Network" = "#e74c3c")) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    legend.position = "top"
  )
Understanding the Neural Network
The following examination reveals what the network learned by analyzing its parameters:
# Extract learned parameters
layer1_weight <- as.matrix(model$layer1$weight$detach())
layer1_bias   <- as.numeric(model$layer1$bias$detach())
layer2_weight <- as.matrix(model$layer2$weight$detach())
layer2_bias   <- as.numeric(model$layer2$bias$detach())

cat("First layer (layer1) parameters:\n")
First layer (layer1) parameters:
cat("Weight matrix shape:", dim(layer1_weight), "\n")
Weight matrix shape: 8 1
cat("Bias vector length:", length(layer1_bias), "\n\n")
Bias vector length: 8
cat("Second layer (layer2) parameters:\n")
Second layer (layer2) parameters:
cat("Weight matrix shape:", dim(layer2_weight), "\n")
Weight matrix shape: 1 8
cat("Bias vector length:", length(layer2_bias), "\n\n")
Bias vector length: 1
# Display first layer weights and biases
cat("First layer weights:\n")
First layer weights:
print(round(layer1_weight, 4))
        [,1]
[1,]  1.2292
[2,] -2.0338
[3,]  0.3231
[4,]  1.4845
[5,]  1.2861
[6,] -0.0174
[7,]  0.1889
[8,] -0.5916
cat("\nFirst layer biases:\n")
First layer biases:
print(round(layer2_bias, 4))
[1] 0.7011 -0.8833 -1.2885 0.4194 0.8179 -0.4609 0.6641 0.2223
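To connect these numbers to predictions, here is a short sketch (not in the original post, and assuming the layer1_weight, layer1_bias, layer2_weight, and layer2_bias objects extracted above) that reproduces the forward pass in base R for a single input:

# Manual forward pass for one input value, using the extracted parameters
x_val  <- 1.5
hidden <- pmax(layer1_weight[, 1] * x_val + layer1_bias, 0)  # layer1 + ReLU
y_hat  <- sum(layer2_weight[1, ] * hidden) + layer2_bias     # layer2 (output)
y_hat  # if training went well, this should be close to 3 * 1.5 + 2 = 6.5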
Experimenting with Different Architectures
The following section compares the simple network with several alternative architectures:
# Define different network architectures
create_network <- function(hidden_sizes) {
  nn_module(
    initialize = function(hidden_sizes) {
      self$layers <- nn_module_list()

      # Input layer
      prev_size <- 1
      for(i in seq_along(hidden_sizes)) {
        self$layers$append(nn_linear(prev_size, hidden_sizes[i]))
        prev_size <- hidden_sizes[i]
      }

      # Output layer
      self$layers$append(nn_linear(prev_size, 1))
    },
    forward = function(x) {
      for(i in 1:(length(self$layers) - 1)) {
        x <- nnf_relu(self$layers[[i]](x))
      }
      # No activation on output layer
      self$layers[[length(self$layers)]](x)
    }
  )
}
# Train different architectures
architectures <- list(
  "Simple (8)" = c(8),
  "Deep (16-8)" = c(16, 8),
  "Wide (32)" = c(32),
  "Very Deep (16-16-8)" = c(16, 16, 8)
)
results <- list()

for(arch_name in names(architectures)) {
  # Create and train model
  net_class  <- create_network(architectures[[arch_name]])
  model_temp <- net_class(architectures[[arch_name]])
  optimizer_temp <- optim_adam(model_temp$parameters, lr = 0.01)

  # Quick training (fewer epochs for comparison)
  for(epoch in 1:200) {
    model_temp$train()
    optimizer_temp$zero_grad()
    y_pred_temp <- model_temp(x)
    loss_temp <- loss_fn(y_pred_temp, y)
    loss_temp$backward()
    optimizer_temp$step()
  }

  # Generate predictions
  model_temp$eval()
  with_no_grad({
    y_pred_arch <- model_temp(x_grid)
  })

  results[[arch_name]] <- data.frame(
    x = x_grid_np,
    y_pred = as.numeric(y_pred_arch$squeeze()),
    architecture = arch_name
  )
}
# Combine results
all_results <- do.call(rbind, results)

# Plot comparison
ggplot(all_results, aes(x = x, y = y_pred, color = architecture)) +
  geom_line(size = 1.2) +
  geom_line(data = comparison_df, aes(y = y_true, color = "True Function"),
            size = 2, linetype = "solid") +
  geom_point(data = plot_df, aes(x = x, y = y_actual),
             color = "gray50", alpha = 0.3, inherit.aes = FALSE) +
  labs(
    title = "Comparison of Different Neural Network Architectures",
    subtitle = "Effects of network depth and width on learning performance",
    x = "Input (x)",
    y = "Output (y)",
    color = "Architecture"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray60"),
    legend.position = "top"
  )
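Another informal way to compare these architectures (an addition to the original post, reusing the create_network() factory and architectures list defined above) is by their parameter counts, since capacity grows quickly with width and depth:

# Count trainable parameters in each architecture
sapply(names(architectures), function(arch_name) {
  m <- create_network(architectures[[arch_name]])(architectures[[arch_name]])
  sum(sapply(m$parameters, function(p) p$numel()))
})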
Key Takeaways
- Simple Architecture: even a small two-layer network can fit this regression task effectively
- Training Process: a proper training loop zeroes the gradients, runs the forward pass, computes the loss, backpropagates, and steps the optimizer
- Visualization: plotting the loss curve and the predictions shows both training progress and the quality of the fit
- Model Evaluation: multiple metrics (MSE, RMSE, MAE, R-squared) give a fuller picture of performance than any single number
- Architecture Comparison: network depth and width change model capacity and affect how the learned function fits the data
The torch package provides a straightforward approach to building and experimenting with neural networks in R, bringing the power of deep learning to the R ecosystem. This approach can be extended to more complex datasets and deeper architectures as needed.