Let's say our final predictions are stored in preds_df. Is it possible to perform the following operation on the prediction file?
# Overwrite model predictions for test rows that are exact duplicates (on the
# columns in `cols`) of a labelled training row: the matching class column is
# set to 1 and every other class column to 0.
#
# Expects `pd` (pandas) and `preds_df` to be defined already.
# NOTE(review): assumes `preds_df` has one row per Test.csv row, sharing
# Test.csv's default row index, and that its first column is an identifier
# rather than a class column -- confirm against the code that builds it.
path = ''

# Tr marks the origin of each row: 1 = Train.csv, 0 = Test.csv.
# The original row labels are kept on purpose so that test rows can still be
# addressed in `preds_df`; train and test labels therefore overlap, and the
# rows must never be looked up by label in the combined frame.
DF = pd.concat([
    pd.read_csv(path + 'Train.csv').assign(Tr=1),
    pd.read_csv(path + 'Test.csv').assign(Tr=0),
])

# Columns that must all match for two transactions to count as duplicates.
cols = [
    'MERCHANT_NAME',
    'PURCHASE_VALUE',
    'PURCHASED_AT',
    'IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY',
    'USER_AGE',
    'USER_GENDER',
    'USER_HOUSEHOLD',
    'USER_INCOME',
    'USER_ID',
]

# Every row that has at least one duplicate on `cols`. A stable sort keeps
# the file order among rows with the same timestamp, so "last train row wins"
# below is deterministic.
DUP = (
    DF[DF.duplicated(cols, keep=False)]
    .drop(columns=['MERCHANT_CATEGORIZED_AT', 'Transaction_ID'])
    .sort_values(by=['PURCHASED_AT'], kind='mergesort')
)

# Every column except the first is treated as a class column.
class_cols = preds_df.columns[1:]

# Group on the full duplicate key, not just PURCHASED_AT, so that unrelated
# transactions that happen to share a timestamp are never mixed together.
# dropna=False mirrors `duplicated`, which treats NaNs as equal.
for _, group in DUP.groupby(cols, sort=False, dropna=False):
    # Positional boolean mask: safe even though index labels may repeat.
    is_train = (group['Tr'] == 1).to_numpy()

    # Known labels from the training rows of this group; missing labels are
    # dropped so a NaN can never wipe out the predictions.
    labels = group.loc[is_train, 'MERCHANT_CATEGORIZED_AS'].dropna()
    ts_index = group.index[~is_train]
    if labels.empty or ts_index.empty:
        continue

    # If several training duplicates exist, the last one wins.
    real_val = labels.iloc[-1]

    # Zero every class column, then set the known category (if it is one of
    # the prediction columns) to 1.
    preds_df.loc[ts_index, class_cols] = 0
    if real_val in class_cols:
        preds_df.loc[ts_index, real_val] = 1