# Hyo Sung Kim and Chin Yee Lee
# MUSA 620 - Final Project
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import geopandas as gpd
import cartopy.crs as ccrs
from shapely.geometry import Point
# Importing Uber Data
# 2014
uber_apr14 = pd.read_csv("./UberData/uber-raw-data-apr14.csv")
uber_may14 = pd.read_csv("./UberData/uber-raw-data-may14.csv")
uber_jun14 = pd.read_csv("./UberData/uber-raw-data-jun14.csv")
uber_jul14 = pd.read_csv("./UberData/uber-raw-data-jul14.csv")
uber_aug14 = pd.read_csv("./UberData/uber-raw-data-aug14.csv")
uber_sep14 = pd.read_csv("./UberData/uber-raw-data-sep14.csv")
# Combined df should have x rows:
uber_apr14.shape[0]+uber_may14.shape[0]+uber_jun14.shape[0]+uber_jul14.shape[0]+uber_aug14.shape[0]+uber_sep14.shape[0]
# Combine 2014 data into one dataframe
frames = [uber_apr14, uber_may14, uber_jun14, uber_jul14, uber_aug14, uber_sep14]
uber_2014 = pd.concat(frames)
print(uber_2014.shape)
uber_2014.head()
# Rename columns(Date/Time to DateTime)
uber_2014.columns = ['DateTime', 'Lat', 'Lon', 'Base']
# Add Type Column
uber_2014['Type'] = 'Uber'
# Add Hour Column
uber_2014['Hour'] = uber_2014['DateTime'].str[9:]
uber_2014['Hour'] = uber_2014['Hour'].str[:-6]
uber_2014.Hour=uber_2014.Hour.astype(int)
# Check
uber_2014.head()
#uber_2014.Hour.unique()
uber_2014.head()
#Add Date Column
date=uber_2014['DateTime'].str.rsplit(" ", n=1, expand=True)
date=date.reset_index()
date['date']=date[0]
date=date[['date']]
date.head()
import datetime as dt
date['date']= date['date'].apply(lambda x:
dt.datetime.strptime(x, '%m/%d/%Y'))
date['date']=date['date'].apply(lambda x:
dt.datetime.strftime(x,'%m/%d/%Y'))
#d.strftime('%d-%m-%Y')
date.head()
uber_2014=pd.concat([uber_2014.reset_index(drop=True), date.reset_index(drop=True)], axis=1)
uber_2014.head()
# Select Columns of Interest
uber_cols = ['DateTime', 'Lat', 'Lon', 'Type', 'Hour', 'date']
uber_2014 = uber_2014[uber_cols]
uber_2014.head()
# Create Month Column
uber_2014['Month']=uber_2014['DateTime'].str[0]
uber_2014.Month=uber_2014.Month.astype(int)
# Check
uber_2014.head()
#uber_2014.Month.unique()
# Create Month Name Column
uber_2014.loc[uber_2014.Month == 4,'MonthName'] = 'April'
uber_2014.loc[uber_2014.Month == 5,'MonthName'] = 'May'
uber_2014.loc[uber_2014.Month == 6,'MonthName'] = 'June'
uber_2014.loc[uber_2014.Month == 7,'MonthName'] = 'July'
uber_2014.loc[uber_2014.Month == 8,'MonthName'] = 'August'
uber_2014.loc[uber_2014.Month == 9,'MonthName'] = 'September'
# Check
uber_2014.head()
#uber_2014.MonthName.unique()
# Import NYC Taxi data via API call
# not using Green cab data (for now)
import requests
import json
#!pip install sodapy
from sodapy import Socrata
# If API Token is needed:
#Name: NYC_Picks
#Description: MUSA620-Project
#App Token: Jh3PX6F3BDgnCmENslklEj3iZ
#takes about +/- 5~10 minutes
#source: https://dev.socrata.com/foundry/data.cityofnewyork.us/gkne-dk5s
# Unauthenticated client only works with public data sets. Note 'None' in place of application token, and no username or password:
client = Socrata("data.cityofnewyork.us", None)
# Get 4.5M results (~similar to uber_2014), returned as JSON from API / converted to Python list of dictionaries by sodapy.
results = client.get("gkne-dk5s", select="vendor_id, pickup_datetime, pickup_longitude, pickup_latitude",where="pickup_datetime between '2014-04-01T00:00:00.000' and '2014-10-01T00:00:00.000'",limit=4500000)
# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
taxi_2014 = results_df
taxi_2014.head()
# Rename columns(Date/Time to DateTime)
taxi_2014.columns = ['DateTime', 'Lat', 'Lon', 'Vendor_ID']
# Add Type Column
taxi_2014['Type'] = 'Taxi'
# Add Hour Column
taxi_2014['Hour'] = taxi_2014['DateTime'].str[11:13]
taxi_2014.Hour=taxi_2014.Hour.astype(int)
# Check
taxi_2014.head()
#taxi_2014.Hour.unique()
# Select Columns of Interest
taxi_cols = ['DateTime', 'Lat', 'Lon', 'Type', 'Hour']
taxi_2014 = taxi_2014[taxi_cols]
taxi_2014.head()
#Add Date Column
date=taxi_2014['DateTime'].str.rsplit("T", n=1, expand=True)
date=date.reset_index()
date['date']=date[0]
date=date[['date']]
date.head()
import datetime as dt
date['date']= date['date'].apply(lambda x:
dt.datetime.strptime(x, '%Y-%m-%d'))
date.head()
date['date']=date['date'].apply(lambda x:
dt.datetime.strftime(x,'%m/%d/%Y'))
#d.strftime('%d-%m-%Y')
date.head()
taxi_2014=pd.concat([taxi_2014.reset_index(drop=True), date.reset_index(drop=True)], axis=1)
taxi_2014.head()
# Select Columns of Interest
taxi_cols = ['DateTime', 'Lat', 'Lon', 'Type', 'Hour', 'date']
taxi_2014 = taxi_2014[taxi_cols]
taxi_2014.head()
# Create Month Column
taxi_2014['Month']=taxi_2014['DateTime'].str[5:7]
taxi_2014.Month=taxi_2014.Month.astype(int)
# Check
taxi_2014.head()
#taxi_2014.Month.unique()
# Create Month Name Column
taxi_2014.loc[taxi_2014.Month == 4,'MonthName'] = 'April'
taxi_2014.loc[taxi_2014.Month == 5,'MonthName'] = 'May'
taxi_2014.loc[taxi_2014.Month == 6,'MonthName'] = 'June'
taxi_2014.loc[taxi_2014.Month == 7,'MonthName'] = 'July'
taxi_2014.loc[taxi_2014.Month == 8,'MonthName'] = 'August'
taxi_2014.loc[taxi_2014.Month == 9,'MonthName'] = 'September'
# Check
taxi_2014.head()
#taxi_2014.MonthName.unique()
#print(taxi_2014['MonthName'].value_counts())
# Make Lat Lon Columns to Numeric
taxi_2014['Lat']=pd.to_numeric(taxi_2014['Lat'])
taxi_2014['Lon']=pd.to_numeric(taxi_2014['Lon'])
taxi_2014.head()
# Check and Compare Uber and Taxi df
uber_2014.head()
taxi_2014.head()
# Comparing Uber vs Taxi Count Trips Between April - September 2014
# Combine Uber and Taxi
# make copies first
uber_byMonth = uber_2014
taxi_byMonth = taxi_2014
# Append, ignore index
uber_taxi_byMonth = uber_byMonth.append(taxi_byMonth, ignore_index=True)
uber_taxi_byMonth.head()
uber_taxi_cols=['Type', 'date', 'Month', 'Hour']
uber_taxi_byMonth=uber_taxi_byMonth[uber_taxi_cols]
uber_taxi_byMonth.head()
uber_taxi_byMonth['dateCount']=0
uber_taxi_byMonth=uber_taxi_byMonth.groupby(['Month','date','Type'])['dateCount'].count()
uber_taxi_byMonth.head()
uber_taxi_byMonth = uber_taxi_byMonth.reset_index()
uber_taxi_byMonth.head()
import altair as alt
source = uber_taxi_byMonth
color = alt.Scale(domain=('Taxi', 'Uber'),
range=["Gold", "Black"])
slider = alt.binding_range(min=4,
max=9,
step = 1)
select_Month = alt.selection_single(name="Month", fields=['Month'], bind=slider)
base=alt.Chart(source, title='Total number of Taxi and Uber Trips Per Day | April 2014 - September 2014').encode(
x=alt.X('date:N', title=None),
y=alt.Y('dateCount:Q', scale=alt.Scale(domain=(0, 45000))),
color=alt.Color('Type:N', scale=color),
tooltip=['Type', 'date', 'dateCount']
).properties(
width=1000
).add_selection(
select_Month
).transform_filter(
select_Month
)
chart1=base.mark_line()
chart1= chart1.configure_axisX(labels=False)
chart1
chart1.save("chart1.html")
import seaborn as sns
import altair as alt
alt.renderers.enable('notebook')
uber_byMonth = uber_2014
taxi_byMonth = taxi_2014
uber_taxi_byMonth = uber_byMonth.append(taxi_byMonth, ignore_index=True)
uber_taxi_byMonth.head()
# Workaround: Order by Month Number
# To do:
# Change colors for type
# Add Title
# ...
# Avg Pick Ups by Hour (Counts) - Avg is done later on
# make copies first
uber_byHourC = uber_2014
taxi_byHourC = taxi_2014
# Append, ignore index
uber_taxi_byHourC = uber_byHourC.append(taxi_byHourC, ignore_index=True)
uber_taxi_byHourC.head()
uber_taxi_byHourC.shape
# Groupby Hour and Type to get Hourly pickup counts for each type of transportation
uber_taxi_byHourC['HourCount']=0
uber_taxi_byHourC=uber_taxi_byHourC.groupby(['Hour','Type'])['HourCount'].count()
uber_taxi_byHourC.head()
# Reset Index
uber_taxi_byHourC = uber_taxi_byHourC.reset_index()
uber_taxi_byHourC.head() #should have 24 hours months, 2 types - total of 48 rows
uber_taxi_byHourC['AverageHourCount']=uber_taxi_byHourC.HourCount/183
uber_taxi_byHourC.head()
#get the total vehicles by hour
uber_taxi_byHourC_HourTotal=uber_taxi_byHourC.groupby(['Hour'])[['AverageHourCount']].sum()
uber_taxi_byHourC_HourTotal['HourTotal']=uber_taxi_byHourC_HourTotal['AverageHourCount']
uber_taxi_byHourC_HourTotal=uber_taxi_byHourC_HourTotal.reset_index()
uber_taxi_byHourC_HourTotal=uber_taxi_byHourC_HourTotal[['Hour', 'HourTotal']]
uber_taxi_byHourC_HourTotal
uber_taxi_byHourC=pd.merge(uber_taxi_byHourC, uber_taxi_byHourC_HourTotal, on='Hour')
uber_taxi_byHourC.head()
uber_taxi_byHourC['HourProp']=uber_taxi_byHourC.AverageHourCount/uber_taxi_byHourC.HourTotal
uber_taxi_byHourC.head()
# Line Plot of Pick Up Counts by Hour by Type with Interactive Bar Plot on the bottom
from altair import Chart, Color, Scale
brush = alt.selection(type='interval', encodings=['x'])
# top line graph
points2 = alt.Chart(uber_taxi_byHourC, title='Average number of pick-ups by hour').mark_line().encode(
Color('Type:N',
scale=Scale(domain=['Taxi', 'Uber'],
range=['gold', 'black'])),
x='Hour:N',
y='AverageHourCount:Q',
#color='Type:N',
tooltip=['Hour:Q', 'Type:N', 'AverageHourCount:Q']
).properties(
selection=brush,
width=800
)
# the bottom bar plot 2
bars2 = alt.Chart(title='Average total number of pick-ups').mark_bar().encode(
y='Type:N',
color='Type:N',
x='sum(AverageHourCount):Q'
)
text = bars2.mark_text(
align='left',
baseline='middle',
dx=3 # Nudges text to right so it doesn't appear on top of the bar
).encode(
text='sum(AverageHourCount):Q'
)
bars2b=(bars2 + text).properties(width=600).transform_filter(brush.ref())
# the bottom bar plot
bars3= alt.Chart(title='Proportion of taxi and uber pick-ups').mark_bar().encode(
x="Hour:N",
y=alt.Y("HourProp:Q", stack="normalize"),
color="Type:N",
tooltip=['Hour:Q', 'Type:N', 'HourProp:Q']
).transform_filter(
brush.ref()
)
chart2 = alt.vconcat(points2, bars2b, bars3, data=uber_taxi_byHourC, center=True) # vertical stacking
chart2
# Save to later embed in HMTL file
chart2.save("chart2.html")
# Traffic Data
traffic = pd.read_csv("Traffic_Volume_Counts__2012-2013.csv")
traffic.head()
# Groupby SegmentID and sum by hourly counts
# Join by Neighborhoods?
# Fishnet/Hex Bin Map?
# Heat Map?
uber_2014.head()
taxi_2014.head()