# Fit a 6-cluster KMeans on the positions recorded in each cumulative
# time-of-day window and store the cluster centres in the matching rows of
# the submission frame `ss`, which is then written to disk.
#
# `ss`, `flaten` and `KMeans` are defined/imported outside this section.

df = pd.read_csv('Train.csv', parse_dates=['datetime'])
df = df.drop('uid', axis=1)
df = df.drop_duplicates()

# Destination columns in `ss`, in the order the flattened centres are written.
centre_columns = [
    'A0_Latitude', 'A0_Longitude',
    'A1_Latitude', 'A1_Longitude',
    'A2_Latitude', 'A2_Longitude',
    'A3_Latitude', 'A3_Longitude',
    'A4_Latitude', 'A4_Longitude',
    'A5_Latitude', 'A5_Longitude',
]

for j in range(3, 22, 3):
    # The window is cumulative: every row whose hour is below j. (The former
    # `hour >= 0` test was always true for valid timestamps; rows with a
    # missing timestamp still compare False and are excluded.)
    d = df[df['datetime'].dt.hour < j].reset_index(drop=True)

    # Outlier bounds come from the window's medians, computed once. The
    # previous row-by-row loop recomputed both medians for every row (O(n^2)),
    # let the bounds drift as rows were blanked out, and assigned through
    # chained indexing (`d['latitude'][i] = np.NaN`), which pandas does not
    # guarantee to write back and which relies on the `np.NaN` alias that
    # NumPy 2.0 removed.
    lat_median = d['latitude'].median()
    lon_median = d['longitude'].median()
    lat_in_range = d['latitude'].between(lat_median - 0.23, lat_median + 0.3)
    lon_in_range = d['longitude'].between(lon_median - 0.18, lon_median + 0.38)

    # Keep rows inside both ranges, then drop rows with a missing value in
    # any column, exactly as the original `dropna()` did.
    d = d[lat_in_range & lon_in_range].dropna()

    # NOTE(review): `precompute_distances` and `n_jobs` were removed in
    # scikit-learn 1.0 and `algorithm='full'` was later renamed to 'lloyd';
    # the call is kept as written for the scikit-learn version this script
    # targets - confirm before upgrading.
    kmeans = KMeans(
        n_clusters=6,
        init='k-means++',
        n_init=50,
        max_iter=50000,
        tol=0.0000001,
        precompute_distances='auto',
        verbose=0,
        random_state=100,
        copy_x=True,
        n_jobs=220,
        algorithm='full',
    ).fit(d[['latitude', 'longitude']])

    # presumably `flaten` turns the (6, 2) centre array into 12 values in
    # lat/lon order - verify against its definition.
    t = flaten(kmeans.cluster_centers_)

    # Only rows of `ss` whose `date` falls exactly in hour j receive centres.
    ss.loc[ss['date'].dt.hour == j, centre_columns] = t

ss.to_csv('kmeans_by time.csv', index=False)