Module came.Part_2_functions_for_eachstep
Functions
def create_folder(inputpath)
-
Expand source code
def create_folder(inputpath):
    """Create a folder (including missing parent folders) if absent.

    Args:
        inputpath: The folder path to create.

    Returns:
        None.
    """
    # exist_ok=True is atomic with respect to concurrent creation and
    # replaces the previous check-then-create (os.path.exists + makedirs)
    # pattern, which could race and raise FileExistsError.
    os.makedirs(inputpath, exist_ok=True)
This function is for creating folders
Args
inputpath
- The folder path
Returns
True
- Omitted
def file_names(inputpath)
-
Expand source code
def file_names(inputpath):
    """Walk a directory tree and collect every os.walk entry.

    Args:
        inputpath: The folder path to be looped over.

    Returns:
        namelist: A list of ``[dirpath, dirnames, filenames]`` triples,
        one per directory visited by ``os.walk``.
    """
    return [[dirpath, dirnames, filenames]
            for dirpath, dirnames, filenames in os.walk(inputpath)]
This function is for looping over files
Args
inputpath
- The folder path to be looped over
Returns
namelist
- The files list
def gam(save_path, csv_name, key)
-
Expand source code
def gam(save_path, csv_name, key):
    """Fit per-group centroid trajectories with a GAM and save outputs.

    Args:
        save_path: The path for saving the result file.
        csv_name: The species name being processed (a '.csv' file name).
        key: The group number for file naming (0-based; files use key+1).

    Returns:
        Lon: The longitude after GAM fitting (WGS84 degrees).
        Lat: The latitude after GAM fitting (WGS84 degrees).
    """
    # Load the pre-grouped centroids written by group() for this key.
    df = pd.read_csv(os.path.join(save_path, csv_name.replace('.csv', ''), 'group{}.csv'.format(key + 1)))
    date = df["date_index"]
    x = df["X"]
    y = df["Y"]
    xx = df["date_index"]
    # 43103 shifts the 1-based day index onto the serial-date scale used
    # elsewhere in the module (days start at 43104) — presumably an Excel
    # serial date origin; TODO confirm.
    xx = xx + 43103
    # One univariate GAM per coordinate axis, fitted on the day index.
    gam_model_x = LinearGAM().fit(date, x)
    gam_model_y = LinearGAM().fit(date, y)
    predictions_x = gam_model_x.predict(date)
    predictions_y = gam_model_y.predict(date)
    # Draw the pictures: left panel is Y (longitude), right panel is X (latitude).
    plt.figure(figsize=(16, 8))
    plt.subplot(1, 2, 1)
    plt.scatter(xx, y, color='darkorange', label='data')
    plt.plot(xx, predictions_y, color='navy', lw=2, label='GAM')
    # plt.plot(X_all, y_gb_longitude_pred, color='c', lw=2, label='Gradient Boosting')
    plt.xlabel('Date')
    plt.ylabel('Longitude(meter)')
    #plt.title('Longitude')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.scatter(xx, x, color='darkorange', label='data')
    plt.plot(xx, predictions_x, color='navy', lw=2, label='GAM')
    # plt.plot(X_all, y_gb_latitude_pred, color='c', lw=2, label='Gradient Boosting')
    plt.xlabel('Date')
    plt.ylabel('Latitude(meter)')
    #plt.title('Latitude')
    plt.legend()
    plt.tight_layout()
    # plt.show()
    plt.savefig(os.path.join(save_path, csv_name.replace('.csv', ''), 'gam{}.jpg'.format(key + 1)))
    plt.close()
    # Calculate the indexes (in-sample fit quality for each axis).
    mse_longitude = mean_squared_error(y, predictions_y)
    mse_latitude = mean_squared_error(x, predictions_x)
    rmse_longitude = np.sqrt(mse_longitude)
    rmse_latitude = np.sqrt(mse_latitude)
    r2_score_longitude = r2_score(y, predictions_y)
    r2_score_latitude = r2_score(x, predictions_x)
    # Write the CSV header only if the file cannot be read yet.
    # NOTE(review): the bare except swallows every error from readcsv, not
    # just "file missing" — a narrower exception type would be safer.
    try:
        readcsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'))
    except:
        savecsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'),
                ['mse_longitude', 'mse_latitude', 'rmse_longitude', 'rmse_latitude',
                 'r2_score_longitude', 'r2_score_latitude'])
    # Append this group's metrics row.
    savecsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'),
            [mse_longitude, mse_latitude, rmse_longitude, rmse_latitude, r2_score_longitude, r2_score_latitude])
    # Persist the fitted trajectory (projected coordinates + day index).
    datas = [["X*", "Y*", "date_index"]]
    # datas = []
    for i in range(len(x)):
        datas.append([predictions_x[i], predictions_y[i], date[i]])
    # data_write(os.path.join(save_path, csv_name.replace('.csv', ''), 'result_{}.xls'.format(key + 1)), datas)
    savecsvs(os.path.join(save_path, csv_name.replace('.csv', ''), 'fitting_result{}.csv'.format(key + 1)), datas)
    # Coordinate conversion to wgs84 (projection2wgs84 takes (lat, lon) and
    # returns (lon, lat); here Y predictions go into the first slot).
    Lon,Lat = projection2wgs84(predictions_y,predictions_x)
    return Lon, Lat
This function is for gam algorithm
Args
save_path
- The path for saving the result file
csv_name
- The species name being processed
key
- The number for file naming
Returns
Lon
- The longitude after GAM fitting
Lat
- The latitude after GAM fitting
def get_all_csv(data_path)
-
Expand source code
def get_all_csv(data_path):
    """Collect the names of all CSV files under a folder tree.

    Args:
        data_path: The folder path storing the csv files.

    Returns:
        all_csv: A list of the file names (not full paths) that end
        with '.csv'.
    """
    all_csv = []
    # file_names() yields one [dirpath, dirnames, filenames] triple per
    # directory visited by os.walk.
    for folder_attribute in file_names(data_path):
        for fileName in folder_attribute[2]:
            # Check the full extension: the previous test of only the last
            # character (fileName[-1] == 'v') also accepted '.tsv', '.mov',
            # or any other name ending in 'v'.
            if fileName.endswith('.csv'):
                all_csv.append(fileName)
    return all_csv
This function is for getting all csv files
Args
data_path
- The folder path of storing the csv files
Returns
all_csv
- All the csv files
def get_initial_data(x, y, date, length, projection1)
-
Expand source code
def get_initial_data(x, y, date, length, projection1):
    """Convert geographical coordinates into projected ones.

    Fix: the documentation string was previously placed after the first
    executable statements, so it was a no-op string literal rather than a
    real docstring; it is now the function's actual docstring.

    Args:
        x: The geographical coordinates of latitude.
        y: The geographical coordinates of longitude.
        date: The observation dates (indexable, length >= ``length``).
        length: The number of records to convert.
        projection1: The EPSG code of the target projection.

    Returns:
        initial_data: A header row followed by converted
        ``[lat, lon, date]`` rows.
    """
    # The EPSG code is shared with wgs84toprojection()/projection2wgs84()
    # through a module-level global.
    global projection
    projection = projection1
    initial_data = [["LATITUDE", "LONGITUDE", "OBSERVATION DATE"]]
    for i in range(length):
        result = wgs84toprojection(x[i], y[i])
        initial_data.append([result[0], result[1], date[i]])
    return initial_data
def get_sldf(window_data_df, save_path, csv_name)
-
Expand source code
def get_sldf(window_data_df, save_path, csv_name):
    """Run per-day sldf outlier detection and save the combined results.

    Args:
        window_data_df: The data after rolling_window (columns: LATITUDE,
            LONGITUDE, OBSERVATION DATE — the third column holds serial
            day numbers starting at 43104).
        save_path: The path for saving the result file.
        csv_name: The species name being processed.

    Returns:
        SLDF_df: DataFrame with columns LATITUDE, LONGITUDE, SLDF,
        OBSERVATION DATE containing the points kept by sldf() for each day.
    """
    xall = window_data_df.values.astype(float)
    # Accumulator: rows of [lat, lon, sldf, date].
    SLDF_all = np.zeros((0, 4))
    # Map each serial day number to the row index where that day's block
    # starts; assumes rows are sorted/grouped by day — TODO confirm.
    day_index = dict()
    day_index[43104] = 0
    for index, i in enumerate(xall[:, 2]):
        if i not in day_index:
            day_index[i] = index
    # Process each of the 359 days (43104..43462); the last day's slice
    # runs to the end of the array.
    for day in range(1, 360):
        date = day + 43103
        if day != 359:
            temp = xall[day_index[date]:day_index[date + 1], :2]
        else:
            temp = xall[day_index[date]:, :2]
        # sldf() returns the retained points with their SLDF score appended.
        outl = sldf(temp)
        # Tag every retained row with the day it belongs to.
        t = date * np.ones((outl.shape[0], 1))
        outl = np.concatenate((outl, t), axis=1)
        SLDF_all = np.concatenate((SLDF_all, outl), axis=0)
    new_columns = ["LATITUDE", "LONGITUDE", "SLDF", "OBSERVATION DATE"]
    SLDF_df = pd.DataFrame(SLDF_all, columns=new_columns)
    SLDF_df.to_csv(os.path.join(save_path, csv_name.replace('.csv',''), 'sldf.csv'), index=False)
    return SLDF_df
This function is for saving the results after sldf outlier detection
Args
window_data_df
- The data after rolling_window
save_path
- The path for saving the result file
csv_name
- The species name being processed
Returns
sldf_df
- The data results after sldf outlier detection
def group(csv_path, save_path, csv_name)
-
Expand source code
def group(csv_path, save_path, csv_name):
    """Group daily subpopulation centroids into trajectories by minimum distance.

    Links each day's Meanshift centroids to the nearest centroids of the
    previous day, accumulating one coordinate trajectory per subpopulation,
    and writes each trajectory to ``group{n}.csv``.

    Args:
        csv_path: The path for the data file after Meanshift algorithm.
        save_path: The path for saving the result file.
        csv_name: The species name being processed.

    Returns:
        True: Omitted (the function returns None; files are its output).
    """
    # A1 collects all centroid coordinates as 2 x N columns; A3 stores, per
    # day, [count of centroids, cumulative count].  The leading zero column
    # is a seed that is deleted after the loop.
    A1 = np.array([[0], [0]])
    A3 = np.array([[0], [0]])
    datas = pd.read_csv(csv_path)
    #datas = datas.iloc[1:, :]
    result_list = []  # NOTE(review): never used.
    for date in range(43104, 43463):
        data = datas.loc[date == datas['OBSERVATION DATE']]
        data = data.iloc[:, :2].values.tolist()
        A1 = np.hstack((A1, np.array(list(data)).T))
        A3 = np.hstack((A3, np.array([[int(len(data))], [A3[1, -1] + int(len(data))]])))
    A1 = np.delete(A1, 0, axis=1)
    A3 = np.delete(A3, 0, axis=1)
    # NOTE(review): these .npy files are written into the current working
    # directory, not save_path — possibly debugging leftovers.
    np.save("A1.npy", A1)
    np.save("A3.npy", A3)
    # A1 = np.load('A1.npy')
    # A3 = np.load('A3.npy')
    p2 = 0  # NOTE(review): never used.
    N3 = A3.shape[1]
    LL4 = 0     # start column of the previous-previous day's block in A1
    dddd = 0    # number of linking iterations performed so far
    LL5 = 0     # NOTE(review): incremented but never read.
    LL6 = 0     # flag/count: >0 once the first linking pass has run
    zhongjian = {}  # "intermediate": trajectories being extended this step
    # Flattened centroid coordinate buffers (pairs of columns per centroid).
    KKK2 = np.zeros((1, 10000))
    KKK3 = np.zeros((1, 10000))
    new1 = np.zeros((2, 1000))  # carried-forward centroid set for the next day
    guiji = {}  # "trajectory": accumulated per-subpopulation coordinate stacks
    # NOTE(review): 10000x10000 ints is ~800 MB — likely far larger than needed.
    abc = np.zeros((10000, 10000), dtype=int)
    #Traversal calculations of the centroid distance between two adjacent days in the annual circle
    for b1 in range(2, N3):
        if LL6 > 0:
            # After the first pass, "yesterday's" centroids are the carried
            # set new1/new2 instead of the raw A1 block.
            LL1 = A3[1, b1 - 2]
            LL2 = A3[1, b1 - 1]
            LL3 = A3[1, b1]
            O1 = A3[0, b1 - 2]  # NOTE(review): O1/KK1 are never used below.
            O2 = new2
            O3 = A3[0, b1]
            KK1 = A1[:, np.arange(LL4, LL1)]
            KK2 = new1
            KK3 = A1[:, np.arange(LL2, LL3)]
            LL4 = LL1
            LL5 = LL5 + 1
        if LL6 == 0:
            # First pass: both days come straight from A1.
            LL1 = A3[1, b1 - 2]
            LL2 = A3[1, b1 - 1]
            LL3 = A3[1, b1]
            O1 = A3[0, b1 - 2]
            O2 = A3[0, b1 - 1]
            O3 = A3[0, b1]
            KK1 = A1[:, np.arange(LL4, LL1)]
            KK2 = A1[:, np.arange(LL1, LL2)]
            KK3 = A1[:, np.arange(LL2, LL3)]
            LL4 = LL1
            LL5 = LL5 + 1
        #Store centroid coordinates (two flat slots per centroid).
        guodu1 = KK2.shape[1]
        for pp1 in range(1, guodu1 + 1):
            kk2 = KK2[:, pp1 - 1]
            kk2 = np.transpose(kk2)
            KKK2[:, np.arange(pp1 * 2 - 2, pp1 * 2)] = kk2
        guodu2 = KK3.shape[1]
        for pp1 in range(1, guodu2 + 1):
            kk3 = KK3[:, pp1 - 1]
            kk3 = np.transpose(kk3)
            KKK3[:, np.arange(pp1 * 2 - 2, pp1 * 2)] = kk3
        #Perform centroid distance traversal calculation when the number of centroids on the later day is greater than the previous day
        #The calculation order needs to consider from O2 to O3 and from O3 to O2 to ensure that all centroids in O3 can be connected.
        jl = np.zeros((100, 100))  # pairwise distance workspace ("juli" = distance)
        if O3 > O2:
            new2 = O3
            dddd = dddd + 1
            # Pass 1: each of yesterday's O2 centroids picks its nearest of
            # today's O3 centroids.
            for t1 in range(1, O2 + 1):
                for t2 in range(1, O3 + 1):
                    jl[t1 - 1, t2 - 1] = np.sqrt(
                        (KK2[0, t1 - 1] - KK3[0, t2 - 1]) ** 2 + (KK2[1, t1 - 1] - KK3[1, t2 - 1]) ** 2)
            index = np.argmin(jl[:O2, :O3], axis=1)
            for t3 in range(1, O2 + 1):
                aaa = int(index[t3 - 1] + 1)
                if dddd == 1:
                    # First link: seed the trajectory with both endpoints.
                    shuju1 = np.vstack(
                        (KKK2[:, np.arange(t3 * 2 - 2, t3 * 2)], KKK3[:, np.arange(aaa * 2 - 2, aaa * 2)]))
                    new1[:, t3 - 1] = KK3[:, aaa - 1]
                    LL6 = LL6 + 1
                if dddd > 1:
                    # Later links: append today's matched centroid.
                    shuju1 = np.vstack((guiji[t3 - 1], KKK3[:, np.arange(aaa * 2 - 2, aaa * 2)]))
                    new1[:, t3 - 1] = KK3[:, aaa - 1]
                zhongjian[t3 - 1] = shuju1
                # zhongjian[t3-1][np.all(shuju1 == 0, axis=1),:].fill(0)
            # Pass 2 (reverse direction): ensure every centroid in the larger
            # day O3 gets attached to some trajectory.
            jl = np.zeros((100, 100))
            for t1 in range(1, O3 + 1):
                for t2 in range(1, O2 + 1):
                    jl[t1 - 1, t2 - 1] = np.sqrt(
                        (KK3[0, t1 - 1] - KK2[0, t2 - 1]) ** 2 + (KK3[1, t1 - 1] - KK2[1, t2 - 1]) ** 2)
            index = np.argmin(jl[:O3, :O2], axis=1)
            XX = np.unique(index)
            nnp = O3 - O2  # how many extra trajectories must be spawned
            nnn = 1
            for i in range(1, len(XX) + 1):
                # Yesterday-centroids matched by two or more of today's
                # centroids mark branch points.
                m = (index == XX[i - 1]).nonzero()[0]
                if len(m) >= 2:
                    abc[nnn - 1, 0] = XX[i - 1] + 1
                    abc[nnn - 1, 1] = len(m)
                    nnn = nnn + 1
                if len(m) >= 2:
                    for nnc in range(1, nnp + 1):
                        if nnn - 1 > 0:
                            aaa = int(index[abc[nnn - 2, 0]] + 1)
                        if nnn - 1 > 1:
                            aaa = int(index[abc[nnn - 2, 0]] + 1)
                            nnn = nnn - 1
                        if dddd > 1:
                            shuju1 = np.vstack((guiji[abc[nnn - 2, 0]], KKK3[:, np.arange(aaa * 2 - 2, aaa * 2)]))
                        if dddd == 1:
                            shuju1 = np.vstack((KKK2[:, np.arange(abc[nnn - 2, 0] * 2 - 2, abc[nnn - 2, 0] * 2)],
                                                KKK3[:, np.arange(aaa * 2 - 2, aaa * 2)]))
                        new1[:, O2 + nnc - 1] = KK3[:, aaa - 1]
                        zhongjian[O2 + nnc - 1] = shuju1
                        # zhongjian[O2 + nnc-1][np.all(shuju1 == 0,axis=1),:].fill(0)
            # Commit this step's intermediate trajectories.
            for t10 in range(0, O3):
                guiji[t10] = zhongjian[t10]
        #Perform centroid distance traversal calculation when the number of centroids on the later day is less than the previous day
        if O3 <= O2:
            new2 = O2
            dddd = dddd + 1
            for t1 in range(1, O2 + 1):
                for t2 in range(1, O3 + 1):
                    jl[t1 - 1, t2 - 1] = np.sqrt(
                        (KK2[0, t1 - 1] - KK3[0, t2 - 1]) ** 2 + (KK2[1, t1 - 1] - KK3[1, t2 - 1]) ** 2)
            index = np.argmin(jl[:O2, :O3], axis=1)
            for t3 in range(1, O2 + 1):
                aaa = int(index[t3 - 1] + 1)
                if dddd == 1:
                    shuju1 = np.vstack(
                        (KKK2[:, np.arange(t3 * 2 - 2, t3 * 2)], KKK3[:, np.arange(aaa * 2 - 2, aaa * 2)]))
                    new1[:, t3 - 1] = KK3[:, aaa - 1]
                    LL6 = LL6 + 1
                if dddd > 1:
                    shuju1 = np.vstack((guiji[t3 - 1], KKK3[:, np.arange(aaa * 2 - 2, aaa * 2)]))
                    new1[:, t3 - 1] = KK3[:, aaa - 1]
                    LL6 = LL6 + 1
                guiji[t3 - 1] = shuju1
                # guiji[t3][np.all(shuju1 == 0, axis=1),:].fill(0)
    # Write one CSV per accumulated trajectory, appending a 1-based
    # date_index column.
    for key, value in guiji.items():
        value = np.hstack((np.array(value), np.arange(1, len(value) + 1).reshape((len(value), 1))))
        df1 = value.tolist()
        # df = pd.DataFrame(value)
        names = [['X', 'Y', 'date_index']] + df1
        # df.columns = names
        # df.to_excel(os.path.join(save_path, csv_name.replace('.csv',''), 'ni_traj{}.xlsx'.format(key + 1)), sheet_name='Sheet1', index=False)
        savecsvs(os.path.join(save_path, csv_name.replace('.csv',''), 'group{}.csv'.format(key + 1)), names)
This function is for grouping the daily subpopulation centroids based on the minimum distance
Args
csv_path
- The path for the data file after Meanshift algorithm
save_path
- The path for saving the result file
csv_name
- The species name being processed
Returns
True
- Omitted
def interpolation(date, length, initial_data)
-
Expand source code
def interpolation(date, length, initial_data):
    """Fill in missing observation dates by averaging recent points.

    For every date in the 365-day window starting at serial day 43101 that
    has no observation, a synthetic point is appended whose coordinates are
    the mean of all points observed in the preceding two days.

    Args:
        date: The observation dates (indexable, length >= ``length``).
        length: The number of observed records.
        initial_data: The data after coordinates conversion; a header row
            followed by ``[lat, lon, date]`` rows. Mutated in place.

    Returns:
        initial_data: The same list with interpolated rows appended in
        ascending date order.
    """
    k = 43101  # serial number of the first day of the annual window
    lose_date = []
    now_date = []
    all_date = [i for i in range(k, k + 365)]
    for i in range(length):
        now_date.append(date[i])
    for i in all_date:
        if i not in now_date:
            lose_date.append(i)
    # print(lose_date)
    # Missing dates are processed in ascending order, so rows interpolated
    # for earlier gaps feed into later gaps.
    for lose in lose_date:
        x = []
        y = []
        for data in initial_data:
            if data[2] == "OBSERVATION DATE":
                continue
            # Use points from the two days immediately before the gap.
            if 0 < lose - int(data[2]) <= 2:
                x.append(data[0])
                y.append(data[1])
        # Guard against an empty look-back window: previously sum(x)/len(x)
        # raised ZeroDivisionError when no point existed within 2 days.
        if x:
            initial_data.append([sum(x) / len(x), sum(y) / len(y), lose])
    return initial_data
This function is to finish interpolation for missing dates
Args
date
- The observation date
length
- The length of the data
initial_data
- The data after coordinates conversion
Returns
initial_data
- The data results after interpolation
def knn(save_path, csv_name, key, n_neighbors)
-
Expand source code
def knn(save_path, csv_name, key, n_neighbors):
    """Fit per-group centroid trajectories with KNN regression and save outputs.

    Args:
        save_path: The path for saving the result file.
        csv_name: The species name being processed (a '.csv' file name).
        key: The group number for file naming (0-based; files use key+1).
        n_neighbors: The number of neighbors for KNeighborsRegressor.

    Returns:
        Lon: The longitude after KNN fitting (WGS84 degrees).
        Lat: The latitude after KNN fitting (WGS84 degrees).
    """
    # Load the pre-grouped centroids written by group() for this key.
    df = pd.read_csv(os.path.join(save_path, csv_name.replace('.csv', ''), 'group{}.csv'.format(key + 1)))
    date = df["date_index"]
    xx = df["date_index"]
    # 43103 shifts the 1-based day index onto the serial-date scale used
    # elsewhere in the module — presumably an Excel serial origin; TODO confirm.
    xx = xx + 43103
    X = df[['date_index']]  # Features (date)
    y_longitude = df['Y']  # Target variable (longitude)
    y_latitude = df['X']  # Target variable (latitude)
    # Train the model (one regressor per coordinate axis).
    knn_longitude = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn_latitude = KNeighborsRegressor(n_neighbors=n_neighbors)
    knn_longitude.fit(X, y_longitude)
    knn_latitude.fit(X, y_latitude)
    y_knn_longitude_pred = knn_longitude.predict(X)
    y_knn_latitude_pred = knn_latitude.predict(X)
    # Draw the pictures: left panel longitude, right panel latitude.
    plt.figure(figsize=(16, 8))
    plt.subplot(1, 2, 1)
    plt.scatter(xx, y_longitude, color='darkorange', label='data')
    plt.plot(xx, y_knn_longitude_pred, color='navy', lw=2, label='KNN')
    # plt.plot(X_all, y_gb_longitude_pred, color='c', lw=2, label='Gradient Boosting')
    plt.xlabel('Date')
    plt.ylabel('Longitude(meter)')
    #plt.title('Longitude')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.scatter(xx, y_latitude, color='darkorange', label='data')
    plt.plot(xx, y_knn_latitude_pred, color='navy', lw=2, label='KNN')
    # plt.plot(X_all, y_gb_latitude_pred, color='c', lw=2, label='Gradient Boosting')
    plt.xlabel('Date')
    plt.ylabel('Latitude(meter)')
    #plt.title('Latitude')
    plt.legend()
    plt.tight_layout()
    # plt.show()
    plt.savefig(os.path.join(save_path, csv_name.replace('.csv', ''), 'KNN{}.jpg'.format(key + 1)))
    plt.close()
    # Calculate the indexes (in-sample fit quality for each axis).
    mse_longitude = mean_squared_error(y_longitude, y_knn_longitude_pred)
    mse_latitude = mean_squared_error(y_latitude, y_knn_latitude_pred)
    rmse_longitude = np.sqrt(mse_longitude)
    rmse_latitude = np.sqrt(mse_latitude)
    r2_score_longitude = r2_score(y_longitude, y_knn_longitude_pred)
    r2_score_latitude = r2_score(y_latitude, y_knn_latitude_pred)
    # Write the CSV header only if the file cannot be read yet.
    # NOTE(review): bare except swallows every error, not just "file missing".
    try:
        readcsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'))
    except:
        savecsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'),
                ['mse_longitude', 'mse_latitude', 'rmse_longitude', 'rmse_latitude',
                 'r2_score_longitude', 'r2_score_latitude'])
    savecsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'),
            [mse_longitude, mse_latitude, rmse_longitude, rmse_latitude, r2_score_longitude, r2_score_latitude])
    # Persist the fitted trajectory (projected coordinates + day index).
    datas = [["X*", "Y*","date_index"]]
    # datas = []
    for i in range(len(date)):
        datas.append([y_knn_latitude_pred[i], y_knn_longitude_pred[i],date[i]])
    # data_write(os.path.join(save_path, csv_name.replace('.csv',''), 'result_{}.xls'.format(key + 1)), datas)
    savecsvs(os.path.join(save_path, csv_name.replace('.csv',''), 'fitting_result{}.csv'.format(key + 1)), datas)
    #Coordinate conversion to wgs84
    Lon, Lat = projection2wgs84(y_knn_longitude_pred, y_knn_latitude_pred)
    return Lon, Lat
This function is for KNN algorithm
Args
save_path
- The path for saving the result file
csv_name
- The species name being processed
key
- The number for file naming
n_neighbors
- The number of neighbors
Returns
Lon
- The longitude after KNN fitting
Lat
- The latitude after KNN fitting
def map_1(save_path, csv_name, type_name)
-
Expand source code
def map_1(save_path,csv_name,type_name, ): """ This function is for showing the trajectories on the map Args: save_path: The path for storing the result figures csv_name: The name of the species to be processed type_name: The fitting model chosen for centroids fitting Returns: True: Omitted """ # plt.rcParams['figure.figsize'] = (28, 8) # plt.show() excel_list = os.listdir(os.path.join(save_path, csv_name.replace('.csv', ''))) excel_list1 = [] for csv_excel in excel_list: if 'group' in csv_excel: excel_list1.append(csv_excel) LON = [] LAT = [] for i in range(len(excel_list1)): if type_name == 'gam': Lon, Lat = gam(save_path, csv_name, i) elif type_name == 'randomforest': n_estimators = 100 random_state = 42 Lon, Lat = randomforest(save_path, csv_name, i,n_estimators,random_state) elif type_name == 'knn': n_neighbors = 5 Lon, Lat = knn(save_path, csv_name, i,n_neighbors) LON.append(Lon) LAT.append(Lat) m = Basemap(llcrnrlat=-60, urcrnrlat=90, llcrnrlon=-180, urcrnrlon=-20) # Instantiate a map m.drawcoastlines() # Draw the coastline m.drawmapboundary(fill_color='white') m.fillcontinents(lake_color='white') # Draw the continents and fill them in white parallels = np.arange(-90., 90., 10.) # Draw latitudes with ranges [-90,90] and intervals of 10 m.drawparallels(parallels, labels=[False, True, True, False], color='none') meridians = np.arange(-180., 180., 20.) 
# Draw the longitude with a range of [-180,180] and an interval of 10 m.drawmeridians(meridians, labels=[True, False, False, True], color='none') for doc in range(0, len(LON)): colorMap = ['red', 'darkorange', 'gold', 'greenyellow', 'pink', 'limegreen', 'mediumturquoise', 'dodgerblue', 'navy', 'blue', 'mediumorchid', 'fuchsia'] # Show labels label = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] marker = ['x', '.', 'o', '|', '*', '.', '<', '>', ',', '.', '.', 'v', 'x', 'o', '|', '*', '<', '^', '.', '*', 'v', '*', ',', 'y', '.', '.', '.', '.'] j = 0 # print(len(lon)) flag = True for i in range(0, len(LON[doc]) - 30, 30): # print(i) if doc == 0: m.plot(LON[doc][i:i + 30], LAT[doc][i:i + 30], marker=marker[doc], linewidth=0.4, color=colorMap[j], markersize=0.5, label=label[ j]) # plt.show() j += 1 if j == 12: j = 0 if flag: plt.legend(loc='lower left', shadow=True) flag = False continue else: m.plot(LON[doc][i:i + 30], LAT[doc][i:i + 30], marker=marker[doc], linewidth=0.4, color=colorMap[j], markersize=0.5) # plt.show() j += 1 if j == 12: j = 0 if flag: plt.legend(loc='lower left', shadow=True) flag = False continue plt.xlabel('Lon', labelpad=10) plt.ylabel('Lat') plt.savefig(os.path.join(save_path, csv_name.replace('.csv', ''), 'trajectories.jpg'), dpi=1000) # plt.show() plt.close()
This function is for showing the trajectories on the map
Args
save_path
- The path for storing the result figures
csv_name
- The name of the species to be processed
type_name
- The fitting model chosen for centroids fitting
Returns
True
- Omitted
def mean_shift(SLDF_df, save_path, csv_name)
-
Expand source code
def mean_shift(SLDF_df,save_path,csv_name): """ This function is for getting centroids of high-density subgroups by Meanshift algorithm Args: SLDF_df: The data after sldf outlier detection save_path: The path for saving the result file csv_name: The species name being processed Returns: result: The data results after Meanshift clustering """ # datas = pd.read_excel('data/clean_window_data.xlsx') datas = SLDF_df.drop(['SLDF'], axis=1) result = [] result.append(["LATITUDE", "LONGITUDE", "OBSERVATION DATE"]) for date in tqdm(range(43104, 43463)): #for date in range(43101, 43119): #print(date) data = datas.loc[date == datas['OBSERVATION DATE']] # .values.tolist()#["answer"] data = data.iloc[:, :2] data = np.array(data) if len(data) == 0: continue ms = MeanShift() ms.fit(data) labels = ms.labels_ cluster_centers = ms.cluster_centers_ labels_unique = np.unique(labels) n_clusters = len(labels_unique) for c in cluster_centers: result.append([float(c[0]), float(c[1]), date]) colors = cycle('bcmyk') if date % 10 == 0: for k, color in zip(range(n_clusters), colors): # current_member indicates true if the label is k and false if not current_member = labels == k cluster_center = cluster_centers[k] # Draw plots plt.plot(data[current_member, 0], data[current_member, 1], color + '.') # Draw circles plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=color, markeredgecolor='k', markersize=14) plt.xlabel('Latitude(meter)') plt.ylabel('Longitude(meter)') # plt.show() plt.savefig(os.path.join(save_path, csv_name.replace('.csv', ''), 'centroids_{}.jpg'.format(date)), dpi=1000) plt.close() return result
This function is for getting centroids of high-density subgroups by Meanshift algorithm
Args
SLDF_df
- The data after sldf outlier detection
save_path
- The path for saving the result file
csv_name
- The species name being processed
Returns
result
- The data results after Meanshift clustering
def projection2wgs84(lat, lon)
-
Expand source code
def projection2wgs84(lat, lon):
    """Convert projected coordinates back into geographical (WGS84) ones.

    Args:
        lat: The projected coordinate passed as the transformer's first
            argument (callers pass the Y predictions here).
        lon: The projected coordinate passed as the transformer's second
            argument (callers pass the X predictions here).

    Returns:
        lon: The geographical coordinates of longitude.
        lat: The geographical coordinates of latitude.
    """
    # Relies on the module-level `projection` EPSG code set earlier by
    # get_initial_data().
    global projection
    crs_WGS84 = CRS.from_epsg(4326)
    crs_projection = CRS.from_epsg(projection)
    transformer = Transformer.from_crs(crs_projection, crs_WGS84)
    # The transformer returns coordinates in the target CRS axis order;
    # they are swapped on return so the caller receives (lon, lat).
    m, n = transformer.transform(lat, lon)
    return n, m
This function is for coordinates conversion into geographical ones
Args
lat
- The projected coordinate of latitude (first argument of the transform)
lon
- The projected coordinate of longitude (second argument of the transform)
Returns
lon
- The geographical coordinates of longitude
lat
- The geographical coordinates of latitude
def randomforest(save_path, csv_name, key, n_estimators, random_state)
-
Expand source code
def randomforest(save_path, csv_name, key, n_estimators, random_state):
    """Fit per-group centroid trajectories with Random Forests and save outputs.

    Args:
        save_path: The path for saving the result file.
        csv_name: The species name being processed (a '.csv' file name).
        key: The group number for file naming (0-based; files use key+1).
        n_estimators: The number of trees.
        random_state: Randomness seed for reproducible forests.

    Returns:
        Lon: The longitude after Random Forests fitting (WGS84 degrees).
        Lat: The latitude after Random Forests fitting (WGS84 degrees).
    """
    # Load the pre-grouped centroids written by group() for this key.
    df = pd.read_csv(os.path.join(save_path, csv_name.replace('.csv', ''), 'group{}.csv'.format(key + 1)))
    date = df["date_index"]
    xx = df["date_index"]
    # 43103 shifts the 1-based day index onto the serial-date scale used
    # elsewhere in the module — presumably an Excel serial origin; TODO confirm.
    xx = xx + 43103
    X = df[['date_index']]  # Features (date)
    y_longitude = df['Y']  # Target variable (longitude)
    y_latitude = df['X']  # Target variable (latitude)
    # Train the model (one regressor per coordinate axis).
    rf_longitude = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_latitude = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_longitude.fit(X, y_longitude)
    rf_latitude.fit(X, y_latitude)
    y_rf_longitude_pred = rf_longitude.predict(X)
    y_rf_latitude_pred = rf_latitude.predict(X)
    # Draw the pictures: left panel longitude, right panel latitude.
    plt.figure(figsize=(16, 8))
    plt.subplot(1, 2, 1)
    plt.scatter(xx, y_longitude, color='darkorange', label='data')
    plt.plot(xx, y_rf_longitude_pred, color='navy', lw=2, label='Random Forest')
    # plt.plot(X_all, y_gb_longitude_pred, color='c', lw=2, label='Gradient Boosting')
    plt.xlabel('Date')
    plt.ylabel('Longitude(meter)')
    #plt.title('Longitude')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.scatter(xx, y_latitude, color='darkorange', label='data')
    plt.plot(xx, y_rf_latitude_pred, color='navy', lw=2, label='Random Forest')
    # plt.plot(X_all, y_gb_latitude_pred, color='c', lw=2, label='Gradient Boosting')
    plt.xlabel('Date')
    plt.ylabel('Latitude(meter)')
    #plt.title('Latitude')
    plt.legend()
    plt.tight_layout()
    # plt.show()
    plt.savefig(os.path.join(save_path, csv_name.replace('.csv', ''), 'randomforest{}.jpg'.format(key + 1)))
    plt.close()
    # Calculate the indexes (in-sample fit quality for each axis).
    mse_longitude = mean_squared_error(y_longitude, y_rf_longitude_pred)
    mse_latitude = mean_squared_error(y_latitude, y_rf_latitude_pred)
    rmse_longitude = np.sqrt(mse_longitude)
    rmse_latitude = np.sqrt(mse_latitude)
    r2_score_longitude = r2_score(y_longitude, y_rf_longitude_pred)
    r2_score_latitude = r2_score(y_latitude, y_rf_latitude_pred)
    # Write the CSV header only if the file cannot be read yet.
    # NOTE(review): bare except swallows every error, not just "file missing".
    try:
        readcsv(save_path + csv_name.replace('.csv','') + '/{}.csv'.format('evaluation_index'))
    except:
        savecsv(save_path + csv_name.replace('.csv', '') + '/{}.csv'.format('evaluation_index'),
                ['mse_longitude', 'mse_latitude', 'rmse_longitude', 'rmse_latitude',
                 'r2_score_longitude', 'r2_score_latitude'])
    savecsv(save_path + csv_name.replace('.csv','') + '/{}.csv'.format('evaluation_index'),
            [mse_longitude,mse_latitude,rmse_longitude,rmse_latitude,r2_score_longitude,r2_score_latitude])
    # Persist the fitted trajectory (projected coordinates + day index).
    datas = [["X*", "Y*","date_index"]]
    # datas = []
    for i in range(len(date)):
        datas.append([y_rf_latitude_pred[i], y_rf_longitude_pred[i],date[i]])
    # data_write(os.path.join(save_path, csv_name.replace('.csv',''), 'result_{}.xls'.format(key + 1)), datas)
    savecsvs(os.path.join(save_path, csv_name.replace('.csv',''), 'fitting_result{}.csv'.format(key + 1)), datas)
    #Coordinate conversion to wgs84
    Lon, Lat = projection2wgs84(y_rf_longitude_pred, y_rf_latitude_pred)
    return Lon, Lat
This function is for Random forests algorithm
Args
save_path
- The path for saving the result file
csv_name
- The species name being processed
key
- The number for file naming
n_estimators
- The number of trees
random_state
- Randomness
Returns
Lon
- The longitude after Random Forests algorithm
Lat
- The latitude after Random Forests algorithm
def rolling_window(initial_data, save_path, csv_name)
-
Expand source code
def rolling_window(initial_data, save_path, csv_name):
    """Expand observations into overlapping 7-day rolling windows.

    Every record whose date falls within (day-3, day+3] of a window day is
    re-stamped with that day, so each window day collects roughly a week
    of surrounding points. The result is also written to
    ``rolling_window_data.csv`` under the species folder.

    Args:
        initial_data: The data after interpolation; a header row followed
            by ``[lat, lon, date]`` rows.
        save_path: The path for saving the result file.
        csv_name: The species name being processed.

    Returns:
        A pandas DataFrame with columns LATITUDE, LONGITUDE,
        OBSERVATION DATE holding the windowed records.
    """
    header = ["LATITUDE", "LONGITUDE", "OBSERVATION DATE"]
    start_day = 43104
    windowed_rows = []
    for day in range(start_day, start_day + 359):
        for record in initial_data:
            # Skip the header row carried inside initial_data.
            if record[2] == "OBSERVATION DATE":
                continue
            offset = record[2] - day
            if -3 < offset <= 3:
                windowed_rows.append([record[0], record[1], day])
    window_df = pd.DataFrame(windowed_rows, columns=header)
    out_path = os.path.join(save_path, csv_name.replace('.csv', ''), 'rolling_window_data.csv')
    window_df.to_csv(out_path, index=False)
    return window_df
This function is for rolling_window algorithm
Args
initial_data
- The data after interpolation
save_path
- The path for saving the result file
csv_name
- The species name being processed
Returns
rolling_window_data_df
- The data results after rolling_window
def sldf(x)
-
Expand source code
def sldf(x):
    """Outlier detection based on SLDF values; keep the densest ~80% of points.

    Args:
        x: The input data; an (n, 2) array of coordinates.

    Returns:
        result: An (m, 3) array of the retained rows of ``x`` with their
        SLDF score appended as a third column (m = int(0.8*n + 1)).
    """
    n = len(x)
    column = len(x[0])  # NOTE(review): never used.
    # Min-max normalize the coordinates into [0, 1].
    x_max = np.max(x)
    x_min = np.min(x)
    x_ = (x - x_min) / (x_max - x_min)
    # Assign each point to a k x k grid cell.
    k = 50
    lens = 1 / k
    position_x = np.ceil(x_ / lens)
    # Points landing exactly on the minimum would fall into cell 0; clamp to 1.
    for i in range(len(position_x)):
        for j in range(len(position_x[0])):
            if position_x[i][j] == 0:
                position_x[i][j] = 1
    # Sort cell assignments (primary key: first coordinate).
    B = np.lexsort([position_x[:, 1], position_x[:, 0]])
    A = position_x[B, :]
    A = A.astype(int)
    # Per-cell occupancy counts.
    count = np.zeros((k, k))
    for i in range(n):
        count[A[i][0] - 1][A[i][1] - 1] += 1
    max_count = np.max(count)
    q = 2
    q = q * max_count  # NOTE(review): q is computed but never used below.
    # Weighted squared-distance matrix between all point pairs.
    w = [0.5, 0.5]
    dist = np.zeros((n, n))
    for i in range(n):
        dist[:, i] = w[0] * ((x_[:, 0] - x_[i, 0]) ** 2) + w[1] * ((x_[:, 1] - x_[i, 1]) ** 2)
    dist = np.sqrt(dist)
    max_dist = np.max(dist)
    k = max_dist  # NOTE(review): rebinds k (the grid size) and is never used after.
    # Build the full list of (j, i) index pairs; every point has exactly n
    # neighbors here, so each point's pair block has length n.
    N = []
    for i in range(len(dist)):
        for j in range(len(dist[0])):
            Ni, Nj = j, i
            N.append((Ni, Nj))
    N = np.array(N)
    u = np.zeros(n)
    SLDR = np.zeros(n)
    N_i = N[:, 0]
    N_j = N[:, 1]
    # SLDR: variance of each point's distances to its neighborhood.
    for i in range(n):
        tmp = np.argwhere(N_j == i)
        tmp_E = int(max(tmp))
        tmp_S = int(min(tmp))
        tmp_N = N[tmp_S: tmp_E + 1, :]
        tmp_D = []
        for j in range(len(tmp_N)):
            a, b = tmp_N[j]
            tmp_D.append(dist[a, b])
        tmp_ji = tmp_E - tmp_S + 1
        u[i] = sum(tmp_D) / tmp_ji
        tmp_c = (tmp_D - u[i]) ** 2
        SLDR[i] = sum(tmp_c) / tmp_ji
    # SLDIR: mean SLDR over each point's neighborhood.
    SLDIR = np.zeros(n)
    for i in range(n):
        tmp = np.argwhere(N_j == i)
        tmp_E = int(max(tmp))
        tmp_S = int(min(tmp))
        tmp = SLDR[N_i[tmp_S: tmp_E + 1]]
        # NOTE(review): tmp_ji is reused from the last iteration of the loop
        # above rather than recomputed; with the all-pairs N built here every
        # block has length n, so the value happens to be correct — confirm if
        # N is ever built differently.
        SLDIR[i] = sum(tmp) / tmp_ji
    # SLDF ratio: low values = locally consistent points; keep the lowest 80%.
    SLDF = SLDR / SLDIR
    # print(SLDF.shape, x.shape)
    selected_index = np.argsort(SLDF)[:int(0.8 * len(SLDF) + 1)]
    # print(selected_index.shape)
    SLDF_new = SLDF[selected_index]
    result = np.concatenate((x[selected_index], SLDF_new[:, np.newaxis]), axis=1)
    return result
This function is for outlier detection based on sldf values
Args
x
- The input data
Returns
result
- The result after sldf outlier detection
def wgs84toprojection(lat, lon)
-
Expand source code
def wgs84toprojection(lat, lon):
    """Convert geographical (WGS84) coordinates into projected ones.

    Fix: the documentation string was previously placed after the
    ``global`` statement, so it was a no-op string literal rather than the
    function's docstring; it is now a real docstring.

    Args:
        lat: The geographical coordinates of latitude.
        lon: The geographical coordinates of longitude.

    Returns:
        m: The coordinates of latitude after conversion (second element of
           the transform, returned first).
        n: The coordinates of longitude after conversion.
    """
    # Relies on the module-level `projection` EPSG code set earlier by
    # get_initial_data().
    global projection
    crs_WGS84 = CRS.from_epsg(4326)
    crs_projection = CRS.from_epsg(projection)
    transformer = Transformer.from_crs(crs_WGS84, crs_projection)
    # The transform output pair is swapped on return, matching the order
    # expected by get_initial_data().
    m, n = transformer.transform(lat, lon)
    return n, m