util_py

1import pandas as pd

2import numpy as np

3import reverse_geocoder as rg

4import pycountry

5import string

6import re

7import geopy

8from geopy.geocoders import Nominatim

9import math

11“””

12 Join all research data and all practicum data and generate the final dataset

13 Input:

14 all_research:

15 Pandas.DataFrame() object containing all usable research data.

16 all_practicum:

17 Pandas.DataFrame() object containing all usable practicum data.

18 Output:

19 final:

20 Pandas.DataFrame() object containing all usable data.

21“””

22def join_ds(all_research, all_practicum):

23 #Drop unused columns in practicum datasets

24 all_practicum = all_practicum.drop(‘Domestic or Global ‘, 1)

25 all_practicum = all_practicum.drop(‘Organization’, 1)

26 #Add Columns

27 long_idx = all_practicum.columns.get_loc(“Longitude”)

28 all_practicum.insert(long_idx+1, “Project Title”, “Others”)

29 all_practicum.insert(long_idx+2, “Department”, “Others”)

30 all_practicum.insert(long_idx+3, “Sponsor”, “Others”)

31 all_practicum.head()

33 #Drop unused columns in research datasets

34 col = [‘Location’,‘Project Start Date’, ‘Project End Date’,‘Country Code / FIPs code’,‘FY14’,‘FY15’,‘FY16’,‘FY17’, ‘FY18’, ‘Proposal Number’,‘Principal Investigator’]

35 for i in range(len(col)):

36 all_research = all_research.drop(col[i], 1)

37 #Research dataset & practicum dataset are in the same format

38 #Concatenate the two datasets

39 final = pd.concat([all_research, all_practicum])

40 final.reset_index(drop=True, inplace=True)

41 return final

43“””

44 Read in research datasets, and merge them into one dataset

45 Input:

46 research_datasets: list of strings of research dataset file names

47 example:

48 research_datasets = [‘research_data1.csv’,’research_data2.xls’,…]

49 accept csv and xls format.

50 output:

51 all_research:

52 Pandas.DataFrame() object containing all research data.

53“””

54def process_research(research_datasets):

55 all_research = pd.DataFrame()

56 for i in range(len(research_datasets)):

57 file_type = research_datasets[i].split(‘.’)[1]

58 if file_type == ‘csv’:

59 file = pd.read_csv(research_datasets[i])

60 elif ‘xls’ in file_type:

61 file = pd.read_excel(research_datasets[i])

62 location(file)

63 frames_research = [file, all_research]

64 all_research = pd.concat(frames_research)

65 return all_research

67“””

68 Find rows need to be cleaned and rows containing county information

69 Input:

70 df:

71 Pandas.DataFrame() object that needs to find dirty rows.

72 Output:

73 clean:

74 Pandas.DataFrame() object that contains correct location information

75 to_be_cleaned:

76 Pandas.DataFrame() object that contains incorrect location information that need human correction.

78“””

79def find_dirty(df):

80 to_be_cleaned = pd.DataFrame(columns=[‘City’,‘State’,‘Country’, ‘Organization’])

81 clean = pd.DataFrame(columns=[‘City’,‘State’,‘Country’, ‘Organization’])

82 for i, row in df.iterrows():

83 if not isinstance(row[‘City’], str) or row[‘City’] is None or len(row[‘City’]) <1:

84 row[‘City’] = “”

85 if not isinstance(row[‘State’], str) or row[‘State’] is None or len(row[‘State’]) <1:

86 row[‘State’] = “”

87 if not isinstance(row[‘Country’], str) or row[‘Country’] is None or len(row[‘Country’]) <1:

88 row[‘Country’] = “”

89 elif ‘states’ in row[‘Country’].lower():

90 row[‘Country’] = “United States”

91 temp = row[‘City’] +” “+ row[‘State’]+” “+row[‘Country’]

92 x = re.findall(“[.,!?;/&]”, temp)

93 if len(x)>0:

94 to_be_cleaned = to_be_cleaned.append(row, ignore_index=True)

95 elif ” and “ in temp.lower():

96 to_be_cleaned = to_be_cleaned.append(row, ignore_index=True)

97 else:

98 clean = clean.append(row, ignore_index=True)

99 return clean, to_be_cleaned

100

101“””

102 Process a Pandas.DataFrame to add columns of location information

103 Input:

104 df: Pandas.DataFrame object

105“””

106def location(df):

107 geolocator = Nominatim(user_agent=“523”)

108 df.insert(0, “Type”,“Research”) # Insert column to indicate whether the project is research

109 loc_idx = df.columns.get_loc(“Location”)

110 df.insert(loc_idx+1, “County”, “Others”) #Insert column to store County, if column location contains such information

111 df.insert(loc_idx+2, “City”,“Others”) #Insert column to store Country, if column location contains such information

112 df.insert(loc_idx+3, “State”, “Others”) #Insert column to store State, if column location contains such information

113 df.insert(loc_idx+4, “Country”,“Others”) #Insert column to store Country, if column location contains such information

114 lat_idx = df.columns.get_loc(“Latitude”)

115

116 for i, row in df.iterrows():

117 temp = row[‘Location’].split(‘,’)

118 #Set Country

119 country = “”

120 if(len(temp) == 1):

121 df.iat[i,loc_idx+4] = temp[0].strip()

122 country = temp[0]

123 #Set State/City Country

124 elif(len(temp) == 2):

125 df.iat[i,loc_idx+4] = temp[1].strip()

126 country = temp[1]

127 if(‘states’ in temp[1].lower()):

128 #If U.S., set state

129 df.iat[i,loc_idx+3] = temp[0].strip()

130 else:

131 #If other countries, set city

132 df.iat[i,loc_idx+2] = temp[0].strip()

133 #Set County, State, Country

134 else:

135 country = temp[2]

136 df.iat[i,loc_idx+1] = temp[0].strip()

137 df.iat[i,loc_idx+3] = temp[1].strip()

138 df.iat[i,loc_idx+4] = temp[2].strip()

139 #Check with Latitude& Longitude correspond to country

140 if not math.isnan(row[‘Latitude’]) and not math.isnan(row[‘Longitude’]):

141 cor = (row[‘Latitude’], row[‘Longitude’])

142 results = rg.search(cor)

143 city = results[0][‘name’]

144 cty = pycountry.countries.get(alpha_2=results[0][‘cc’]).name

145 cty = re.findall(r”[\w’]+|[.,!?;]”, cty)[0]

146 #If the country generated from latitude and longitude does not match with country variable

147 #Replace with approximate latitude and longitude generated on location variable

148 if cty.lower() not in country.strip().lower():

149 location = geolocator.geocode(row[‘Location’])

150 df.iat[i,lat_idx] = location.latitude

151 df.iat[i,lat_idx+1] = location.longitude

152 #If the country matches, fill in information on city based on latitude and longitude

153 else:

154 df.iat[i,loc_idx+2] = city

155

156“””

157 Process practicum data.

158 Input:

159 df: Pandas.DataFrame() containing data to be processed

160 all_practicum:

161 Pandas.DataFrame(columns = [required columns]) to store usable practicum data

162 exmaple:

163 pd.DataFrame(columns=[‘Type’, ‘County’,’City’,’State’,’Country’,’Latitude’,’Longitude’, ‘Organization’])

164 unusable:

165 Pandas.DataFrame(columns = [required columns]) to store incorrect/unusable practicum data

166 Output:

167 all_practicum

168 unusable

169“””

170def process_practicum(df, all_practicum, unusable):

171 geolocator = Nominatim(user_agent=“523”)

172 df.insert(0, “Type”, “Practicum”)

173 city_idx = df.columns.get_loc(“City”)

174 df.insert(city_idx, “County”, “Others”)

175 city_idx += 1

176 cty_idx = df.columns.get_loc(“Country”)

177 df.insert(cty_idx+1, “Latitude”, math.nan)

178 df.insert(cty_idx+2, “Longitude”, math.nan)

179 for i, row in df.iterrows():

180 city = row[‘City’]

181 state = row[‘State’]

182 country = row[‘Country’]

183 if not isinstance(city, str) or city is None or len(city) <1:

184 city = “”

185 row[‘City’] = “Others”

186 else:

187 row[‘City’] = row[‘City’].strip()

188 if not isinstance(state, str) or state is None or len(state) <1:

189 state = “”

190 row[‘State’] = “Others”

191 else:

192 row[‘State’] = row[‘State’].strip()

193 if not isinstance(state, str) or country is None or len(country) <1:

194 country = “”

195 row[‘Country’] = “Others”

196 else:

197 row[‘Country’] = row[‘Country’].strip()

198 if ‘states’ in row[‘Country’].lower():

199 row[‘Country’] = “United States”

200 temp = city + ‘ ‘+state+‘ ‘+country

201 #Fill approximate latitude and longitude values for practicum data

202 location = geolocator.geocode(temp.strip())

203 if “county” in city.lower() and “states” in country.lower():

204 row[‘County’] = city

205 row[‘City’] = “Others”

206 if location is not None:

207 row[‘Latitude’] = location.latitude

208 row[‘Longitude’] = location.longitude

209 cor = (location.latitude, location.longitude)

210 results = rg.search(cor)

211 if row[‘Country’] == “Others” and results is not None:

212 row[‘Country’] = pycountry.countries.get(alpha_2=results[0][‘cc’]).name

213 all_practicum = all_practicum.append(row, ignore_index=True)

214 else:

215 unusable = unusable.append(row, ignore_index=True)

216 return all_practicum, unusable

Coverage for util.py : 100%

133 statements

util_py

Coverage for util.py : 100%

133 statements 133 run 0 missing 0 excluded

133 statements