Skip to content

util_py

1import pandas as pd 

2import numpy as np 

3import reverse_geocoder as rg 

4import pycountry 

5import string 

6import re 

7import geopy 

8from geopy.geocoders import Nominatim 

9import math 

10 

11“”” 

12 Join all research data and all practicum data and generate the final dataset 

13 Input: 

14 all_research:  

15 Pandas.DataFrame() object containing all usable research data. 

16 all_practicum: 

17 Pandas.DataFrame() object containing all usable practicum data. 

18 Output: 

19 final: 

20 Pandas.DataFrame() object containing all usable data. 

21“”” 

22def join_ds(all_research, all_practicum): 

23 #Drop unused columns in practicum datasets 

24 all_practicum = all_practicum.drop(‘Domestic or Global ‘, 1) 

25 all_practicum = all_practicum.drop(‘Organization’, 1) 

26 #Add Columns 

27 long_idx = all_practicum.columns.get_loc(“Longitude”) 

28 all_practicum.insert(long_idx+1, “Project Title”, “Others”) 

29 all_practicum.insert(long_idx+2, “Department”, “Others”) 

30 all_practicum.insert(long_idx+3, “Sponsor”, “Others”) 

31 all_practicum.head() 

32 

33 #Drop unused columns in research datasets 

34 col = [‘Location’,‘Project Start Date’, ‘Project End Date’,‘Country Code / FIPs code’,‘FY14’,‘FY15’,‘FY16’,‘FY17’, ‘FY18’, ‘Proposal Number’,‘Principal Investigator’] 

35 for i in range(len(col)): 

36 all_research = all_research.drop(col[i], 1) 

37 #Research dataset & practicum dataset are in the same format 

38 #Concatenate the two datasets 

39 final = pd.concat([all_research, all_practicum]) 

40 final.reset_index(drop=True, inplace=True) 

41 return final 

42 

43“”” 

44 Read in research datasets, and merge them into one dataset 

45 Input:  

46 research_datasets: list of strings of research dataset file names 

47 example: 

48 research_datasets = [‘research_data1.csv’,’research_data2.xls’,…] 

49 accept csv and xls format. 

50 output: 

51 all_research: 

52 Pandas.DataFrame() object containing all research data. 

53“”” 

54def process_research(research_datasets): 

55 all_research = pd.DataFrame() 

56 for i in range(len(research_datasets)): 

57 file_type = research_datasets[i].split(‘.’)[1] 

58 if file_type == ‘csv’: 

59 file = pd.read_csv(research_datasets[i]) 

60 elif ‘xls’ in file_type: 

61 file = pd.read_excel(research_datasets[i]) 

62 location(file) 

63 frames_research = [file, all_research] 

64 all_research = pd.concat(frames_research) 

65 return all_research 

66 

67“”” 

68 Find rows need to be cleaned and rows containing county information 

69 Input: 

70 df:  

71 Pandas.DataFrame() object that needs to find dirty rows. 

72 Output: 

73 clean: 

74 Pandas.DataFrame() object that contains correct location information 

75 to_be_cleaned: 

76 Pandas.DataFrame() object that contains incorrect location information that need human correction. 

77 

78“”” 

79def find_dirty(df): 

80 to_be_cleaned = pd.DataFrame(columns=[‘City’,‘State’,‘Country’, ‘Organization’]) 

81 clean = pd.DataFrame(columns=[‘City’,‘State’,‘Country’, ‘Organization’]) 

82 for i, row in df.iterrows(): 

83 if not isinstance(row[‘City’], str) or row[‘City’] is None or len(row[‘City’]) <1: 

84 row[‘City’] = “” 

85 if not isinstance(row[‘State’], str) or row[‘State’] is None or len(row[‘State’]) <1: 

86 row[‘State’] = “” 

87 if not isinstance(row[‘Country’], str) or row[‘Country’] is None or len(row[‘Country’]) <1: 

88 row[‘Country’] = “” 

89 elif ‘states’ in row[‘Country’].lower(): 

90 row[‘Country’] = “United States” 

91 temp = row[‘City’] +” “+ row[‘State’]+” “+row[‘Country’] 

92 x = re.findall(“[.,!?;/&]”, temp) 

93 if len(x)>0: 

94 to_be_cleaned = to_be_cleaned.append(row, ignore_index=True) 

95 elif ” and “ in temp.lower(): 

96 to_be_cleaned = to_be_cleaned.append(row, ignore_index=True) 

97 else: 

98 clean = clean.append(row, ignore_index=True) 

99 return clean, to_be_cleaned 

100 

101“”” 

102 Process a Pandas.DataFrame to add columns of location information 

103 Input: 

104 df: Pandas.DataFrame object 

105“”” 

106def location(df): 

107 geolocator = Nominatim(user_agent=“523”) 

108 df.insert(0, “Type”,“Research”) # Insert column to indicate whether the project is research 

109 loc_idx = df.columns.get_loc(“Location”) 

110 df.insert(loc_idx+1, “County”, “Others”) #Insert column to store County, if column location contains such information 

111 df.insert(loc_idx+2, “City”,“Others”) #Insert column to store Country, if column location contains such information 

112 df.insert(loc_idx+3, “State”, “Others”) #Insert column to store State, if column location contains such information 

113 df.insert(loc_idx+4, “Country”,“Others”) #Insert column to store Country, if column location contains such information 

114 lat_idx = df.columns.get_loc(“Latitude”) 

115 

116 for i, row in df.iterrows(): 

117 temp = row[‘Location’].split(‘,’) 

118 #Set Country 

119 country = “” 

120 if(len(temp) == 1): 

121 df.iat[i,loc_idx+4] = temp[0].strip() 

122 country = temp[0] 

123 #Set State/City Country 

124 elif(len(temp) == 2): 

125 df.iat[i,loc_idx+4] = temp[1].strip() 

126 country = temp[1] 

127 if(‘states’ in temp[1].lower()): 

128 #If U.S., set state 

129 df.iat[i,loc_idx+3] = temp[0].strip() 

130 else: 

131 #If other countries, set city 

132 df.iat[i,loc_idx+2] = temp[0].strip() 

133 #Set County, State, Country 

134 else: 

135 country = temp[2] 

136 df.iat[i,loc_idx+1] = temp[0].strip() 

137 df.iat[i,loc_idx+3] = temp[1].strip() 

138 df.iat[i,loc_idx+4] = temp[2].strip() 

139 #Check with Latitude& Longitude correspond to country 

140 if not math.isnan(row[‘Latitude’]) and not math.isnan(row[‘Longitude’]): 

141 cor = (row[‘Latitude’], row[‘Longitude’]) 

142 results = rg.search(cor) 

143 city = results[0][‘name’] 

144 cty = pycountry.countries.get(alpha_2=results[0][‘cc’]).name 

145 cty = re.findall(r”[\w’]+|[.,!?;]”, cty)[0] 

146 #If the country generated from latitude and longitude does not match with country variable 

147 #Replace with approximate latitude and longitude generated on location variable 

148 if cty.lower() not in country.strip().lower(): 

149 location = geolocator.geocode(row[‘Location’]) 

150 df.iat[i,lat_idx] = location.latitude 

151 df.iat[i,lat_idx+1] = location.longitude 

152 #If the country matches, fill in information on city based on latitude and longitude 

153 else: 

154 df.iat[i,loc_idx+2] = city 

155 

156“”” 

157 Process practicum data. 

158 Input: 

159 df: Pandas.DataFrame() containing data to be processed 

160 all_practicum:  

161 Pandas.DataFrame(columns = [required columns]) to store usable practicum data 

162 exmaple: 

163 pd.DataFrame(columns=[‘Type’, ‘County’,’City’,’State’,’Country’,’Latitude’,’Longitude’, ‘Organization’]) 

164 unusable:  

165 Pandas.DataFrame(columns = [required columns]) to store incorrect/unusable practicum data 

166 Output: 

167 all_practicum 

168 unusable 

169“”” 

170def process_practicum(df, all_practicum, unusable): 

171 geolocator = Nominatim(user_agent=“523”) 

172 df.insert(0, “Type”, “Practicum”) 

173 city_idx = df.columns.get_loc(“City”) 

174 df.insert(city_idx, “County”, “Others”) 

175 city_idx += 1 

176 cty_idx = df.columns.get_loc(“Country”) 

177 df.insert(cty_idx+1, “Latitude”, math.nan) 

178 df.insert(cty_idx+2, “Longitude”, math.nan) 

179 for i, row in df.iterrows(): 

180 city = row[‘City’] 

181 state = row[‘State’] 

182 country = row[‘Country’] 

183 if not isinstance(city, str) or city is None or len(city) <1: 

184 city = “” 

185 row[‘City’] = “Others” 

186 else: 

187 row[‘City’] = row[‘City’].strip() 

188 if not isinstance(state, str) or state is None or len(state) <1: 

189 state = “” 

190 row[‘State’] = “Others” 

191 else: 

192 row[‘State’] = row[‘State’].strip() 

193 if not isinstance(state, str) or country is None or len(country) <1: 

194 country = “” 

195 row[‘Country’] = “Others” 

196 else: 

197 row[‘Country’] = row[‘Country’].strip() 

198 if ‘states’ in row[‘Country’].lower(): 

199 row[‘Country’] = “United States” 

200 temp = city + ‘ ‘+state+‘ ‘+country 

201 #Fill approximate latitude and longitude values for practicum data 

202 location = geolocator.geocode(temp.strip()) 

203 if “county” in city.lower() and “states” in country.lower(): 

204 row[‘County’] = city 

205 row[‘City’] = “Others” 

206 if location is not None: 

207 row[‘Latitude’] = location.latitude 

208 row[‘Longitude’] = location.longitude 

209 cor = (location.latitude, location.longitude) 

210 results = rg.search(cor) 

211 if row[‘Country’] == “Others” and results is not None: 

212 row[‘Country’] = pycountry.countries.get(alpha_2=results[0][‘cc’]).name 

213 all_practicum = all_practicum.append(row, ignore_index=True) 

214 else: 

215 unusable = unusable.append(row, ignore_index=True) 

216 return all_practicum, unusable