|
4 | 4 | -- Report number of columns in table <nyctaxi_trip> |
5 | 5 | SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '<nyctaxi_trip>' |
6 | 6 |
|
| 7 | + -- Exploration: Trip distribution by medallion |
7 | 8 | SELECT medallion, COUNT(*) |
8 | 9 | FROM <nyctaxi_fare> |
9 | 10 | WHERE pickup_datetime BETWEEN '20130101' AND '20130331' |
10 | 11 | GROUP BY medallion |
11 | 12 | HAVING COUNT(*) > 100 |
12 | 13 |
|
| 14 | + -- Exploration: Trip distribution by medallion and hack_license |
13 | 15 | SELECT medallion, hack_license, COUNT(*) |
14 | 16 | FROM <nyctaxi_fare> |
15 | 17 | WHERE pickup_datetime BETWEEN '20130101' AND '20130131' |
16 | 18 | GROUP BY medallion, hack_license |
17 | 19 | HAVING COUNT(*) > 100 |
18 | 20 |
|
| 21 | + -- Data Quality Assessment: Count records with invalid longitude and/or latitude |
19 | 22 | SELECT COUNT(*) FROM <nyctaxi_trip> |
20 | 23 | WHERE pickup_datetime BETWEEN '20130101' AND '20130331' |
21 | 24 | AND (CAST(pickup_longitude AS float) NOT BETWEEN -90 AND 90 |
|
25 | 28 | OR (pickup_longitude = '0' AND pickup_latitude = '0') |
26 | 29 | OR (dropoff_longitude = '0' AND dropoff_latitude = '0')) |
27 | 30 |
|
| 31 | + -- Exploration: Tipped vs. Not Tipped Trips distribution |
28 | 32 | SELECT tipped, COUNT(*) AS tip_freq FROM ( |
29 | 33 | SELECT CASE WHEN (tip_amount > 0) THEN 1 ELSE 0 END AS tipped, tip_amount |
30 | 34 | FROM <nyctaxi_fare> |
31 | 35 | WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc |
32 | 36 | GROUP BY tipped |
33 | 37 |
|
| 38 | + -- Exploration: Tip Class/Range Distribution |
34 | 39 | SELECT tip_class, COUNT(*) AS tip_freq FROM ( |
35 | 40 | SELECT CASE |
36 | 41 | WHEN (tip_amount = 0) THEN 0 |
|
43 | 48 | WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc |
44 | 49 | GROUP BY tip_class |
45 | 50 |
|
46 | | - |
| 51 | + -- Exploration: Compute and Compare Trip Distance |
47 | 52 | /****** Object: UserDefinedFunction [dbo].[fnCalculateDistance] ******/ |
48 | 53 | SET ANSI_NULLS ON |
49 | 54 | GO |
|
55 | 60 | DROP FUNCTION fnCalculateDistance |
56 | 61 | GO |
57 | 62 |
|
58 | | - CREATE FUNCTION [dbo].[fnCalculateDistance] (@Lat1 float, @Long1 float, @Lat2 float, @Long2 float) |
59 | 63 | -- User-defined function to calculate the direct distance between two geographical coordinates. |
| 64 | + CREATE FUNCTION [dbo].[fnCalculateDistance] (@Lat1 float, @Long1 float, @Lat2 float, @Long2 float) |
| 65 | + |
60 | 66 | RETURNS float |
61 | 67 | AS |
62 | 68 | BEGIN |
|
77 | 83 | END |
78 | 84 | GO |
79 | 85 |
|
| 86 | + -- Sample query to call the function to create features |
80 | 87 | SELECT pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude, |
81 | 88 | dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) AS DirectDistance |
82 | 89 | FROM <nyctaxi_trip> |
|
85 | 92 | AND CAST(dropoff_latitude AS float) BETWEEN -90 AND 90 |
86 | 93 | AND pickup_longitude != '0' AND dropoff_longitude != '0' |
87 | 94 |
|
| 95 | + -- Preparing Data for Model Building |
88 | 96 | SELECT t.*, f.payment_type, f.fare_amount, f.surcharge, f.mta_tax, f.tolls_amount, f.total_amount, f.tip_amount, |
89 | 97 | CASE WHEN (tip_amount > 0) THEN 1 ELSE 0 END AS tipped, |
90 | 98 | CASE WHEN (tip_amount = 0) THEN 0 |
|
100 | 108 | AND t.pickup_datetime = f.pickup_datetime |
101 | 109 | AND pickup_longitude != '0' AND dropoff_longitude != '0' |
102 | 110 |
|
103 | | - |
| 111 | + -- Persist query results in a sample table |
104 | 112 | CREATE TABLE <nyctaxi_sample> |
105 | 113 | WITH |
106 | 114 | ( |
|
0 commit comments