I have cleaned the data for the train and test files, and I scored 99+. Here is the data: https://www.kaggle.com/datasets/muhammadqasimshabbir/cleaned-data/ Please upvote my dataset if you find it helpful, so that it will be easier for others to find. I will make the notebook public soon as well.
Hey, mind sharing the script or methods used for data cleaning as well?
I grouped the rows by matching ID and then took the mean aggregation for both the train and test files. Here is the script: import pandas as pd
"""Clean the Amini canopy train/test data.

Fills missing values with 0, collapses duplicate IDs by taking the
column-wise mean, aligns the test set with SampleSubmission.csv,
re-attaches the first observed Target per train ID, pads both frames
to a common column count, and writes the cleaned CSVs.
"""
import pandas as pd

# Load datasets
train = pd.read_csv("/kaggle/input/amini-canopy-dataset/Train.csv")
test = pd.read_csv("/kaggle/input/amini-canopy-dataset/Test.csv")
submission = pd.read_csv("/kaggle/input/amini-canopy-dataset/SampleSubmission.csv")

# Fill missing values with 0
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

# Merge duplicate IDs by aggregating with the mean.
# FIX: numeric_only=True — pandas >= 2.0 raises TypeError when mean()
# hits non-numeric columns instead of silently dropping them.
train_grouped = train.groupby("ID").mean(numeric_only=True).reset_index()
test_grouped = test.groupby("ID").mean(numeric_only=True).reset_index()

# Ensure test IDs match SampleSubmission.csv
test_grouped = test_grouped[test_grouped["ID"].isin(submission["ID"])]

# Extract target column from train (assuming the target column is named "Target").
# FIX: the grouped frame may already hold a mean-aggregated "Target" column;
# drop it before merging so we do not end up with Target_x / Target_y.
if "Target" in train.columns:
    target_df = train[["ID", "Target"]].drop_duplicates(subset=["ID"], keep="first")
    train_grouped = train_grouped.drop(columns=["Target"], errors="ignore")
    train_grouped = train_grouped.merge(target_df, on="ID", how="left")

# Pad the narrower frame with zero columns so both have the same width.
# FIX: reindex() with the label "Padding" repeated is invalid (duplicate
# target labels), so each padding column gets a unique name instead.
max_features = max(len(train_grouped.columns), len(test_grouped.columns))
for frame in (train_grouped, test_grouped):
    for i in range(max_features - len(frame.columns)):
        frame["Padding" if i == 0 else f"Padding_{i}"] = 0

# Print final test length
print(f"Final test dataset length: {len(test_grouped)}")

# Save cleaned train and test files
train_grouped.to_csv("Train_Cleaned.csv", index=False)
test_grouped.to_csv("Test_Cleaned.csv", index=False)