Coverage for util.py : 100%
133 statements
1import pandas as pd
2import numpy as np
3import reverse_geocoder as rg
4import pycountry
5import string
6import re
7import geopy
8from geopy.geocoders import Nominatim
9import math
11“””
12 Join all research data and all practicum data and generate the final dataset
13 Input:
14 all_research:
15 Pandas.DataFrame() object containing all usable research data.
16 all_practicum:
17 Pandas.DataFrame() object containing all usable practicum data.
18 Output:
19 final:
20 Pandas.DataFrame() object containing all usable data.
21“””
22def join_ds(all_research, all_practicum):
23 #Drop unused columns in practicum datasets
24 all_practicum = all_practicum.drop(‘Domestic or Global ‘, 1)
25 all_practicum = all_practicum.drop(‘Organization’, 1)
26 #Add Columns
27 long_idx = all_practicum.columns.get_loc(“Longitude”)
28 all_practicum.insert(long_idx+1, “Project Title”, “Others”)
29 all_practicum.insert(long_idx+2, “Department”, “Others”)
30 all_practicum.insert(long_idx+3, “Sponsor”, “Others”)
31 all_practicum.head()
33 #Drop unused columns in research datasets
34 col = [‘Location’,‘Project Start Date’, ‘Project End Date’,‘Country Code / FIPs code’,‘FY14’,‘FY15’,‘FY16’,‘FY17’, ‘FY18’, ‘Proposal Number’,‘Principal Investigator’]
35 for i in range(len(col)):
36 all_research = all_research.drop(col[i], 1)
37 #Research dataset & practicum dataset are in the same format
38 #Concatenate the two datasets
39 final = pd.concat([all_research, all_practicum])
40 final.reset_index(drop=True, inplace=True)
41 return final
43“””
44 Read in research datasets, and merge them into one dataset
45 Input:
46 research_datasets: list of strings of research dataset file names
47 example:
48 research_datasets = [‘research_data1.csv’,’research_data2.xls’,…]
49 accept csv and xls format.
50 output:
51 all_research:
52 Pandas.DataFrame() object containing all research data.
53“””
54def process_research(research_datasets):
55 all_research = pd.DataFrame()
56 for i in range(len(research_datasets)):
57 file_type = research_datasets[i].split(‘.’)[1]
58 if file_type == ‘csv’:
59 file = pd.read_csv(research_datasets[i])
60 elif ‘xls’ in file_type:
61 file = pd.read_excel(research_datasets[i])
62 location(file)
63 frames_research = [file, all_research]
64 all_research = pd.concat(frames_research)
65 return all_research
67“””
68 Find rows need to be cleaned and rows containing county information
69 Input:
70 df:
71 Pandas.DataFrame() object that needs to find dirty rows.
72 Output:
73 clean:
74 Pandas.DataFrame() object that contains correct location information
75 to_be_cleaned:
76 Pandas.DataFrame() object that contains incorrect location information that need human correction.
78“””
79def find_dirty(df):
80 to_be_cleaned = pd.DataFrame(columns=[‘City’,‘State’,‘Country’, ‘Organization’])
81 clean = pd.DataFrame(columns=[‘City’,‘State’,‘Country’, ‘Organization’])
82 for i, row in df.iterrows():
83 if not isinstance(row[‘City’], str) or row[‘City’] is None or len(row[‘City’]) <1:
84 row[‘City’] = “”
85 if not isinstance(row[‘State’], str) or row[‘State’] is None or len(row[‘State’]) <1:
86 row[‘State’] = “”
87 if not isinstance(row[‘Country’], str) or row[‘Country’] is None or len(row[‘Country’]) <1:
88 row[‘Country’] = “”
89 elif ‘states’ in row[‘Country’].lower():
90 row[‘Country’] = “United States”
91 temp = row[‘City’] +” “+ row[‘State’]+” “+row[‘Country’]
92 x = re.findall(“[.,!?;/&]”, temp)
93 if len(x)>0:
94 to_be_cleaned = to_be_cleaned.append(row, ignore_index=True)
95 elif ” and “ in temp.lower():
96 to_be_cleaned = to_be_cleaned.append(row, ignore_index=True)
97 else:
98 clean = clean.append(row, ignore_index=True)
99 return clean, to_be_cleaned
101“””
102 Process a Pandas.DataFrame to add columns of location information
103 Input:
104 df: Pandas.DataFrame object
105“””
106def location(df):
107 geolocator = Nominatim(user_agent=“523”)
108 df.insert(0, “Type”,“Research”) # Insert column to indicate whether the project is research
109 loc_idx = df.columns.get_loc(“Location”)
110 df.insert(loc_idx+1, “County”, “Others”) #Insert column to store County, if column location contains such information
111 df.insert(loc_idx+2, “City”,“Others”) #Insert column to store Country, if column location contains such information
112 df.insert(loc_idx+3, “State”, “Others”) #Insert column to store State, if column location contains such information
113 df.insert(loc_idx+4, “Country”,“Others”) #Insert column to store Country, if column location contains such information
114 lat_idx = df.columns.get_loc(“Latitude”)
116 for i, row in df.iterrows():
117 temp = row[‘Location’].split(‘,’)
118 #Set Country
119 country = “”
120 if(len(temp) == 1):
121 df.iat[i,loc_idx+4] = temp[0].strip()
122 country = temp[0]
123 #Set State/City Country
124 elif(len(temp) == 2):
125 df.iat[i,loc_idx+4] = temp[1].strip()
126 country = temp[1]
127 if(‘states’ in temp[1].lower()):
128 #If U.S., set state
129 df.iat[i,loc_idx+3] = temp[0].strip()
130 else:
131 #If other countries, set city
132 df.iat[i,loc_idx+2] = temp[0].strip()
133 #Set County, State, Country
134 else:
135 country = temp[2]
136 df.iat[i,loc_idx+1] = temp[0].strip()
137 df.iat[i,loc_idx+3] = temp[1].strip()
138 df.iat[i,loc_idx+4] = temp[2].strip()
139 #Check with Latitude& Longitude correspond to country
140 if not math.isnan(row[‘Latitude’]) and not math.isnan(row[‘Longitude’]):
141 cor = (row[‘Latitude’], row[‘Longitude’])
142 results = rg.search(cor)
143 city = results[0][‘name’]
144 cty = pycountry.countries.get(alpha_2=results[0][‘cc’]).name
145 cty = re.findall(r”[\w’]+|[.,!?;]”, cty)[0]
146 #If the country generated from latitude and longitude does not match with country variable
147 #Replace with approximate latitude and longitude generated on location variable
148 if cty.lower() not in country.strip().lower():
149 location = geolocator.geocode(row[‘Location’])
150 df.iat[i,lat_idx] = location.latitude
151 df.iat[i,lat_idx+1] = location.longitude
152 #If the country matches, fill in information on city based on latitude and longitude
153 else:
154 df.iat[i,loc_idx+2] = city
156“””
157 Process practicum data.
158 Input:
159 df: Pandas.DataFrame() containing data to be processed
160 all_practicum:
161 Pandas.DataFrame(columns = [required columns]) to store usable practicum data
162 exmaple:
163 pd.DataFrame(columns=[‘Type’, ‘County’,’City’,’State’,’Country’,’Latitude’,’Longitude’, ‘Organization’])
164 unusable:
165 Pandas.DataFrame(columns = [required columns]) to store incorrect/unusable practicum data
166 Output:
167 all_practicum
168 unusable
169“””
170def process_practicum(df, all_practicum, unusable):
171 geolocator = Nominatim(user_agent=“523”)
172 df.insert(0, “Type”, “Practicum”)
173 city_idx = df.columns.get_loc(“City”)
174 df.insert(city_idx, “County”, “Others”)
175 city_idx += 1
176 cty_idx = df.columns.get_loc(“Country”)
177 df.insert(cty_idx+1, “Latitude”, math.nan)
178 df.insert(cty_idx+2, “Longitude”, math.nan)
179 for i, row in df.iterrows():
180 city = row[‘City’]
181 state = row[‘State’]
182 country = row[‘Country’]
183 if not isinstance(city, str) or city is None or len(city) <1:
184 city = “”
185 row[‘City’] = “Others”
186 else:
187 row[‘City’] = row[‘City’].strip()
188 if not isinstance(state, str) or state is None or len(state) <1:
189 state = “”
190 row[‘State’] = “Others”
191 else:
192 row[‘State’] = row[‘State’].strip()
193 if not isinstance(state, str) or country is None or len(country) <1:
194 country = “”
195 row[‘Country’] = “Others”
196 else:
197 row[‘Country’] = row[‘Country’].strip()
198 if ‘states’ in row[‘Country’].lower():
199 row[‘Country’] = “United States”
200 temp = city + ‘ ‘+state+‘ ‘+country
201 #Fill approximate latitude and longitude values for practicum data
202 location = geolocator.geocode(temp.strip())
203 if “county” in city.lower() and “states” in country.lower():
204 row[‘County’] = city
205 row[‘City’] = “Others”
206 if location is not None:
207 row[‘Latitude’] = location.latitude
208 row[‘Longitude’] = location.longitude
209 cor = (location.latitude, location.longitude)
210 results = rg.search(cor)
211 if row[‘Country’] == “Others” and results is not None:
212 row[‘Country’] = pycountry.countries.get(alpha_2=results[0][‘cc’]).name
213 all_practicum = all_practicum.append(row, ignore_index=True)
214 else:
215 unusable = unusable.append(row, ignore_index=True)
216 return all_practicum, unusable