Original NYC Taxi ML NotebookΒΆ
%pip install lightgbm shapely
Requirement already satisfied: lightgbm in /conda/lib/python3.7/site-packages (2.3.0)
Requirement already satisfied: shapely in /User/.pythonlibs/jupyter2/lib/python3.7/site-packages (1.7.1)
Requirement already satisfied: scipy in /conda/lib/python3.7/site-packages (from lightgbm) (1.4.1)
Requirement already satisfied: numpy in /conda/lib/python3.7/site-packages (from lightgbm) (1.19.2)
Requirement already satisfied: scikit-learn in /conda/lib/python3.7/site-packages (from lightgbm) (0.23.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /conda/lib/python3.7/site-packages (from scikit-learn->lightgbm) (2.1.0)
Requirement already satisfied: joblib>=0.11 in /conda/lib/python3.7/site-packages (from scikit-learn->lightgbm) (0.17.0)
Note: you may need to restart the kernel to use updated packages.
import numpy as np
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc
import shapely.wkt
from io import StringIO
def clean_df(df):
return df[(df.fare_amount > 0) & (df.fare_amount <= 500) &
(df.PULocationID > 0) & (df.PULocationID <= 263) &
(df.DOLocationID > 0) & (df.DOLocationID <= 263)]
def radian_conv(degree):
"""
Return radian.
"""
return np.radians(degree)
# To Compute Haversine distance
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
"""
Return distance along great radius between pickup and dropoff coordinates.
"""
#Define earth radius (km)
R_earth = 6371
#Convert degrees to radians
pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
[pickup_lat, pickup_lon,
dropoff_lat, dropoff_lon])
#Compute distances along lat, lon dimensions
dlat = dropoff_lat - pickup_lat
dlon = dropoff_lon - pickup_lon
#Compute haversine distance
a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
return 2 * R_earth * np.arcsin(np.sqrt(a))
def add_airport_dist(dataset):
"""
Return minimum distance from pickup or dropoff coordinates to each airport.
JFK: John F. Kennedy International Airport
EWR: Newark Liberty International Airport
LGA: LaGuardia Airport
SOL: Statue of Liberty
NYC: Newyork Central
"""
jfk_coord = (40.639722, -73.778889)
ewr_coord = (40.6925, -74.168611)
lga_coord = (40.77725, -73.872611)
sol_coord = (40.6892,-74.0445) # Statue of Liberty
nyc_coord = (40.7141667,-74.0063889)
pickup_lat = dataset['pickup_latitude']
dropoff_lat = dataset['dropoff_latitude']
pickup_lon = dataset['pickup_longitude']
dropoff_lon = dataset['dropoff_longitude']
pickup_jfk = sphere_dist(pickup_lat, pickup_lon, jfk_coord[0], jfk_coord[1])
dropoff_jfk = sphere_dist(jfk_coord[0], jfk_coord[1], dropoff_lat, dropoff_lon)
pickup_ewr = sphere_dist(pickup_lat, pickup_lon, ewr_coord[0], ewr_coord[1])
dropoff_ewr = sphere_dist(ewr_coord[0], ewr_coord[1], dropoff_lat, dropoff_lon)
pickup_lga = sphere_dist(pickup_lat, pickup_lon, lga_coord[0], lga_coord[1])
dropoff_lga = sphere_dist(lga_coord[0], lga_coord[1], dropoff_lat, dropoff_lon)
pickup_sol = sphere_dist(pickup_lat, pickup_lon, sol_coord[0], sol_coord[1])
dropoff_sol = sphere_dist(sol_coord[0], sol_coord[1], dropoff_lat, dropoff_lon)
pickup_nyc = sphere_dist(pickup_lat, pickup_lon, nyc_coord[0], nyc_coord[1])
dropoff_nyc = sphere_dist(nyc_coord[0], nyc_coord[1], dropoff_lat, dropoff_lon)
dataset['jfk_dist'] = pickup_jfk + dropoff_jfk
dataset['ewr_dist'] = pickup_ewr + dropoff_ewr
dataset['lga_dist'] = pickup_lga + dropoff_lga
dataset['sol_dist'] = pickup_sol + dropoff_sol
dataset['nyc_dist'] = pickup_nyc + dropoff_nyc
return dataset
def add_datetime_info(dataset):
#Convert to datetime format
dataset['pickup_datetime'] = pd.to_datetime(dataset['tpep_pickup_datetime'],format="%Y-%m-%d %H:%M:%S")
dataset['hour'] = dataset.pickup_datetime.dt.hour
dataset['day'] = dataset.pickup_datetime.dt.day
dataset['month'] = dataset.pickup_datetime.dt.month
dataset['weekday'] = dataset.pickup_datetime.dt.weekday
dataset['year'] = dataset.pickup_datetime.dt.year
return dataset
def get_zones_dict(zones_url):
zones_df = pd.read_csv(zones_url)
# Remove unnecessary fields
zones_df.drop(['Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough'], axis=1, inplace=True)
# Convert DF to dictionary
zones_dict = zones_df.set_index('OBJECTID').to_dict('index')
# Add lat/long to each zone
for zone in zones_dict:
shape = shapely.wkt.loads(zones_dict[zone]['the_geom'])
zones_dict[zone]['long'] = shape.centroid.x
zones_dict[zone]['lat'] = shape.centroid.y
return zones_dict
def get_zone_lat(zones_dict, zone_id):
return zones_dict[zone_id]['lat']
def get_zone_long(zones_dict, zone_id):
return zones_dict[zone_id]['long']
zones_dict = get_zones_dict('https://s3.wasabisys.com/iguazio/data/Taxi/taxi_zones.csv')
new_train_df = pd.read_csv('https://s3.wasabisys.com/iguazio/data/Taxi/yellow_tripdata_2019-01_subset.csv')
new_train_df = clean_df(new_train_df)
# This can take a minute or two
new_train_df['pickup_latitude'] = new_train_df.apply(lambda x: get_zone_lat(zones_dict, x['PULocationID']), axis=1 )
new_train_df['pickup_longitude'] = new_train_df.apply(lambda x: get_zone_long(zones_dict, x['PULocationID']), axis=1 )
new_train_df['dropoff_latitude'] = new_train_df.apply(lambda x: get_zone_lat(zones_dict, x['DOLocationID']), axis=1 )
new_train_df['dropoff_longitude'] = new_train_df.apply(lambda x: get_zone_long(zones_dict, x['DOLocationID']), axis=1 )
new_train_df = add_datetime_info(new_train_df)
new_train_df = add_airport_dist(new_train_df)
new_train_df['pickup_latitude'] = radian_conv(new_train_df['pickup_latitude'])
new_train_df['pickup_longitude'] = radian_conv(new_train_df['pickup_longitude'])
new_train_df['dropoff_latitude'] = radian_conv(new_train_df['dropoff_latitude'])
new_train_df['dropoff_longitude'] = radian_conv(new_train_df['dropoff_longitude'])
y = new_train_df['fare_amount']
# Remove unnecessary fields
new_train_df.drop(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'congestion_surcharge', 'improvement_surcharge', 'pickup_datetime',
'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount', 'RatecodeID', 'store_and_fwd_flag',
'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount'],
axis=1, inplace=True, errors='ignore')
x_train,x_test,y_train,y_test = train_test_split(new_train_df,y,random_state=123,test_size=0.10)
del new_train_df
del y
gc.collect()
79
params = {
'boosting_type':'gbdt',
'objective': 'regression',
'nthread': 4,
'num_leaves': 31,
'learning_rate': 0.05,
'max_depth': -1,
'subsample': 0.8,
'bagging_fraction' : 1,
'max_bin' : 5000 ,
'bagging_freq': 20,
'colsample_bytree': 0.6,
'metric': 'rmse',
'min_split_gain': 0.5,
'min_child_weight': 1,
'min_child_samples': 10,
'scale_pos_weight':1,
'zero_as_missing': True,
'seed':0,
'num_rounds':50000
}
train_set = lgbm.Dataset(x_train, y_train, silent=False,categorical_feature=['year','month','day','weekday'])
valid_set = lgbm.Dataset(x_test, y_test, silent=False,categorical_feature=['year','month','day','weekday'])
model = lgbm.train(params, train_set = train_set, num_boost_round=10000,early_stopping_rounds=500,verbose_eval=500, valid_sets=valid_set)
del x_train
del y_train
del x_test
del y_test
gc.collect()
/conda/lib/python3.7/site-packages/lightgbm/engine.py:148: UserWarning: Found `num_rounds` in params. Will use it instead of argument
warnings.warn("Found `{}` in params. Will use it instead of argument".format(alias))
/conda/lib/python3.7/site-packages/lightgbm/basic.py:1243: UserWarning: Using categorical_feature in Dataset.
warnings.warn('Using categorical_feature in Dataset.')
Training until validation scores don't improve for 500 rounds
[500] valid_0's rmse: 3.01588
[1000] valid_0's rmse: 2.9836
[1500] valid_0's rmse: 2.97776
[2000] valid_0's rmse: 2.98137
Early stopping, best iteration is:
[1615] valid_0's rmse: 2.97663
732
test_data = StringIO("""
passenger_count,trip_distance,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist
1,0.80,0.711950,-1.291073,0.712059,1.290988,13,1,1,1,2019,47.274013,40.386065,16.975747,26.587155,18.925788
""")
test_df = pd.read_csv(test_data, sep=",")
test_df
passenger_count | trip_distance | pickup_latitude | pickup_longitude | dropoff_latitude | dropoff_longitude | hour | day | month | weekday | year | jfk_dist | ewr_dist | lga_dist | sol_dist | nyc_dist | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.8 | 0.71195 | -1.291073 | 0.712059 | 1.290988 | 13 | 1 | 1 | 1 | 2019 | 47.274013 | 40.386065 | 16.975747 | 26.587155 | 18.925788 |
#Predict from test set
prediction = model.predict(test_df, num_iteration = model.best_iteration)
print(prediction)
[9.53629134]