👉 This code will take approximately 12+ hours on a GPU.
I built a pipeline using EfficientNetV2 as the backbone, incorporating metadata-based feature engineering and an attention mechanism. This combination enhances the model’s ability to distinguish between different installation environments and accurately quantify solar technology adoption.
# -*- coding: utf-8 -*-
import os

# NO_ALBUMENTATIONS_UPDATE suppresses the albumentations version-check, which
# runs at import time — so it must be set BEFORE albumentations is imported.
# (The original set it after the import, where it had no effect.)
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'

import albumentations as A
import cv2
import numpy as np
import pandas as pd
import timm
import torch
import torch.nn as nn
import torch.optim as optim
from albumentations.pytorch import ToTensorV2
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from torch.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
# Enhanced Dataset with Metadata
class SolarPanelDataset(Dataset):
    """Image + metadata dataset for the solar-panel counting task.

    Each item yields the (optionally transformed) RGB image and a 5-dim
    metadata vector: slot 0 flags rows whose ``img_origin`` equals "D",
    slots 1-4 one-hot encode the placement category. In training mode a
    float32 target ``[boil_nbr, pan_nbr]`` is returned as well.
    """

    def __init__(self, dataframe, transform=None, to_train=True):
        self.dataframe = dataframe
        self.transform = transform
        self.to_train = to_train
        # Unknown placement values fall back to the "S-unknown" slot (3).
        self.placement_map = {"roof": 0, "openspace": 1, "r_openspace": 2, "S-unknown": 3}

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image = cv2.imread(row["path"])
        if image is None:
            # cv2.imread returns None (instead of raising) for missing or
            # unreadable files; fail loudly with the offending path rather
            # than crashing with an opaque error inside cvtColor.
            raise FileNotFoundError(f"Unable to read image: {row['path']}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; pipeline expects RGB
        # Metadata: [img_origin == "D" flag, 4-way one-hot placement]
        metadata = torch.zeros(5)
        metadata[0] = 1.0 if row["img_origin"] == "D" else 0.0
        placement = self.placement_map.get(row["placement"], 3)
        metadata[1 + placement] = 1.0
        if self.transform:
            image = self.transform(image=image)['image']
        if self.to_train:
            target = torch.tensor([row["boil_nbr"], row["pan_nbr"]], dtype=torch.float32)
            return image, metadata, target
        return image, metadata
# Model with Metadata
class EfficientNetV2Meta(nn.Module):
    """EfficientNetV2-B3 backbone fused with a processed metadata vector.

    The backbone's pooled features are concatenated with a 64-dim metadata
    embedding and regressed to two non-negative counts.

    NOTE(review): the metadata forms a single attention token, so
    self-attention over it is effectively just its value projection —
    presumably kept for extensibility; confirm it earns its parameters.
    """

    def __init__(self):
        super().__init__()
        # num_classes=0 strips the classifier head so the backbone emits
        # pooled feature vectors. A larger backbone is a drop-in swap.
        self.backbone = timm.create_model("tf_efficientnetv2_b3", pretrained=True, num_classes=0)
        self.meta_processor = nn.Sequential(
            nn.Linear(5, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
        )
        self.attention = nn.MultiheadAttention(embed_dim=64, num_heads=4)
        self.regressor = nn.Sequential(
            nn.Linear(self.backbone.num_features + 64, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 2),
            nn.Softplus(),  # keeps both count predictions non-negative
        )

    def forward(self, image, metadata):
        visual = self.backbone(image)
        # (batch, 5) -> (seq=1, batch, 64): MultiheadAttention defaults to
        # sequence-first layout, so the metadata acts as a one-token sequence.
        token = self.meta_processor(metadata.unsqueeze(0))
        attended, _ = self.attention(token, token, token)
        fused = torch.cat([visual, attended.squeeze(0)], dim=1)
        return self.regressor(fused)
# Advanced Augmentation
# NOTE: recent albumentations versions take the crop size as a single
# `size=(h, w)` tuple; the old positional (height, width) form of
# RandomResizedCrop is deprecated/removed, so use the keyword form.
train_transform = A.Compose([
    A.RandomResizedCrop(size=(512, 512), scale=(0.7, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
    A.GaussianBlur(blur_limit=(3, 7), p=0.3),
    A.CLAHE(clip_limit=4.0, p=0.5),
    A.HueSaturationValue(p=0.3),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),  # ImageNet stats
    ToTensorV2(),
])
# Deterministic eval-time pipeline: resize + normalize only, no augmentation.
_eval_steps = [
    A.Resize(512, 512),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
]
test_transform = A.Compose(_eval_steps)
# Training Configuration
def _load_train_frame():
    """Read Train.csv and collapse duplicate rows per image ID.

    Counts are summed across duplicate annotations; categorical metadata
    keeps the first value seen. Adds the on-disk image path column.
    """
    df = pd.read_csv("Train.csv")
    df = df.groupby("ID").agg({
        "boil_nbr": "sum",
        "pan_nbr": "sum",
        "img_origin": "first",
        "placement": "first",
    }).reset_index()
    df["path"] = "images/" + df["ID"] + ".jpg"
    return df


def _train_one_epoch(model, loader, criterion, optimizer, scaler, desc):
    """Run one mixed-precision training pass; return the mean batch loss."""
    model.train()
    total = 0.0
    pbar = tqdm(loader, desc=desc)
    for images, meta, targets in pbar:
        images = images.cuda(non_blocking=True)
        meta = meta.cuda(non_blocking=True)
        targets = targets.cuda(non_blocking=True)
        optimizer.zero_grad()
        with autocast(device_type='cuda'):
            outputs = model(images, meta)
            loss = criterion(outputs, targets)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total += loss.item()
        pbar.set_postfix(loss=loss.item())
    return total / len(loader)


def _validate(model, loader, criterion, desc):
    """Evaluate the model; return (mean loss, predictions, targets) arrays."""
    model.eval()
    total = 0.0
    preds, truths = [], []
    with torch.no_grad():
        for images, meta, targets in tqdm(loader, desc=desc):
            images = images.cuda(non_blocking=True)
            meta = meta.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)
            with autocast(device_type='cuda'):
                outputs = model(images, meta)
                loss = criterion(outputs, targets)
            total += loss.item()
            preds.append(outputs.cpu().numpy())
            truths.append(targets.cpu().numpy())
    return total / len(loader), np.concatenate(preds), np.concatenate(truths)


def train(fold=0, epochs=20, batch_size=16):
    """Train one cross-validation fold and return its best validation MAE.

    Uses a fixed-seed 5-fold split so every fold call partitions the data
    consistently. The best (lowest-MAE) weights are written to
    ``best_model_fold{fold}.pth``.
    """
    train_df = _load_train_frame()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_idx, val_idx = list(kf.split(train_df))[fold]
    train_ds = SolarPanelDataset(train_df.iloc[train_idx], transform=train_transform)
    val_ds = SolarPanelDataset(train_df.iloc[val_idx], transform=test_transform)
    train_loader = DataLoader(train_ds, batch_size=batch_size,
                              shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size * 2,
                            shuffle=False, num_workers=4, pin_memory=True)

    model = EfficientNetV2Meta().cuda()
    criterion = nn.HuberLoss(delta=1.0)  # robust to outlier counts vs. plain MSE
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)
    scaler = GradScaler()

    best_mae = float('inf')
    for epoch in range(epochs):
        train_loss = _train_one_epoch(model, train_loader, criterion, optimizer,
                                      scaler, f"Epoch {epoch+1}/{epochs} [Train]")
        val_loss, preds, truths = _validate(model, val_loader, criterion,
                                            f"Epoch {epoch+1}/{epochs} [Val]")
        mae = mean_absolute_error(truths, preds)
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val MAE: {mae:.4f}")
        # Checkpoint on validation MAE (the competition metric), not loss.
        if mae < best_mae:
            best_mae = mae
            torch.save(model.state_dict(), f"best_model_fold{fold}.pth")
        scheduler.step()
    return best_mae
# Ensemble inference: averages predictions across the supplied checkpoints.
def predict(test_df, model_paths, batch_size=32):
    """Average per-fold model predictions over the test set.

    Returns an (n_samples, 2) array of [boiler, panel] count predictions.
    NOTE(review): despite the original "TTA" naming, no test-time
    augmentation is applied — each model scores every image exactly once.
    """
    test_df["path"] = "images/" + test_df["ID"] + ".jpg"
    dataset = SolarPanelDataset(test_df, transform=test_transform, to_train=False)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    totals = np.zeros((len(test_df), 2))
    for checkpoint in model_paths:
        net = EfficientNetV2Meta().cuda()
        # weights_only=True refuses arbitrary pickled objects in the file.
        net.load_state_dict(torch.load(checkpoint, weights_only=True))
        net.eval()
        fold_preds = []
        with torch.no_grad():
            for images, meta in tqdm(loader, desc="Inference"):
                images = images.cuda()
                meta = meta.cuda()
                with autocast(device_type='cuda'):
                    out = net(images, meta)
                fold_preds.append(out.cpu().numpy())
        totals += np.concatenate(fold_preds)
    return totals / len(model_paths)
# Main Execution
if __name__ == "__main__":
    # Train every fold and collect the saved checkpoint paths.
    n_folds = 5
    model_paths = []
    for fold in range(n_folds):
        print(f"Training fold {fold+1}/{n_folds}")
        best_mae = train(fold=fold, epochs=52, batch_size=32)
        model_paths.append(f"best_model_fold{fold}.pth")

    # Ensemble the fold checkpoints over the test set.
    test_df = pd.read_csv("Test.csv")
    predictions = predict(test_df, model_paths, batch_size=64)

    # Long-format submission: two rows per image — boiler count first,
    # panel count second — distinguished by an ID suffix.
    submission = pd.DataFrame({
        "ID": np.repeat(test_df["ID"].values, 2),
        "Target": predictions.flatten(),
    })
    submission["ID"] += np.where(
        submission.groupby("ID").cumcount() == 0,
        "_boil",
        "_pan",
    )
    submission.to_csv("submission_original.csv", index=False)

    # Also save an integer-rounded variant, since targets are counts.
    int_submission = submission.copy()
    int_submission["Target"] = np.round(int_submission["Target"]).astype(int)
    int_submission.to_csv("submission_integer.csv", index=False)
    print("Submissions saved with shapes:", submission.shape, int_submission.shape)
I hope this helps. For further improvements, try vision transformers, larger backbones, and more epochs.
👉 This code will take approximately 12+ hours on a GPU.
Upvote post and kaggle notebook if you find the code helpful.
Kaggle Link:https://www.kaggle.com/code/johndoe2011/efficientnetv2-lacuna-solar-panel
Wow, I am perplexed. Thank you so much @zulo40. I have upvoted it.
If I maintain my rank, I will surely share the code after the competition.
wow! This is amazing!!
I like your code so much that I made a video for it : )
https://www.youtube.com/watch?v=CyYt5ufgkBA
Thank you again for sharing it!
Thanks buddy. Can you tell me how you added that voice? It's good and totally aligned.
LOL it's a secret :P
Thanks @everyone for making this the most upvoted post. This motivates me to share other approaches in the future too.
I wish i can upvote 10 times :)
you can use the code below to compute cv score. Very nice cv score!
train_df = pd.read_csv("/raid/ml/solar/Train.csv")
train_df = train_df.groupby("ID").agg({...})
model.load_state_dict(torch.load(f"best_model_fold{fold}.pth"))
for images, meta in tqdm(val_loader, desc=f"Predicting Fold {fold}"):
    ...
print(f"Validation MAE - Boil: {mae_boil:.4f}, Pan: {mae_pan:.4f}, Overall: {overall_mae:.4f}")
print(f"MAE: {mae:.2f}")
print(f"MAE (pan): {mae_pan:.2f}")
print(f"MAE (boil): {mae_boil:.2f}")

@snow what's the LB score for this CV?
I didn't submit. I think it is around 0.96 lb. please note that this is using all 5 folds. the original code trained 3 out of 5 folds. but the difference is small < 0.02 mae
The original code didn't set random state so it can vary. i had a way off score when I tried to check @zulo40's notebook.
I noticed that GPU utilization is not high due to reading the image on every __getitem__() call. Instead, I read all the images only once and cached them, and I got a 3x speedup in training. I also resized the images to 1280x720, so caching cost 13GB of system memory. You can tune this for your memory. Just be mindful that later there is a 512x512 random crop, so don't resize too much.
self.placement_map = {"roof": 0, "openspace": 1, "r_openspace": 2, "S-unknown": 3}
self.images = {}
print("Caching images...")
print(f"Warning: Unable to read image at {row['path']}")
print(f"Error loading image at index {idx}: {e}")
print(f"Successfully cached {len(self.images)} out of {len(dataframe)} images")