# NYC Data wrangling using Python and Azure SQL Data Warehouse

#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
#                    License Information                     #
#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
# This sample IPython Notebook is shared by Microsoft under the MIT license.
# Please check the LICENSE.txt file in the directory where this Python script file is stored
# for license information and additional details.

#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
#                       Prerequisites                        #
#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
# Anaconda Python 2.7
# Or Python 2.7 and modules including pandas, numpy, matplotlib, time, pyodbc, tables
# Azure SQL Data Warehouse provisioned

#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
#                        Background                          #
#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
# This notebook demonstrates data exploration and feature generation
# using Python and SQL queries for data stored in Azure SQL Data Warehouse.
# We start with reading a sample of the data into a Pandas data frame and
# visualizing and exploring the data.
# We show how to use Python to execute SQL queries against the data
# and manipulate data directly within the Azure SQL Data Warehouse.

# This IPNB is accompanying material to the Azure Data Science in Action walkthrough document
# (https://azure.microsoft.com/en-us/documentation/articles/machine-learning-data-science-process-sqldw-walkthrough/)
# and uses the New York City Taxi dataset (http://www.andresmh.com/nyctaxitrips/).

#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
#    Step 1: Read data in Pandas frame for visualizations    #
#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-#
# We start with loading a sample of the data in a Pandas data frame and performing some explorations on the sample.
# We join the Trip and Fare data and select the top 10000 rows of the dataset in a Pandas dataframe.
# We assume that the Trip and Fare tables have been created and loaded to tables in SQL Data Warehouse.
# If you haven't done this already please refer to the 'Load the data to SQL Data Warehouse' section of this walkthrough.
| 39 | +# Step 1.1. Import required packages in this experiment (no output) |
| 40 | +import pandas as pd |
| 41 | +from pandas import Series, DataFrame |
| 42 | +import numpy as np |
| 43 | +import matplotlib.pyplot as plt |
| 44 | +from time import time |
| 45 | +import pyodbc |
| 46 | +import os |
| 47 | +import tables |
| 48 | +import time |
| 49 | + |
| 50 | +# Step 1.2. Initialize Database Credentials (no output) |
| 51 | +SERVER_NAME = '<server name>' |
| 52 | +DATABASE_NAME = '<database name>' |
| 53 | +USERID = '<user name>' |
| 54 | +PASSWORD = '<password>' |
| 55 | +DB_DRIVER = '<database driver>' |
| 56 | + |
| 57 | +# Step 1.3. Create Data Warehouse Connection (no output) |
| 58 | +CONNECTION_STRING = ';'.join([driver,server,database,uid,pwd, ';TDS_VERSION=7.3;Port=1433']) |
| 59 | +print CONNECTION_STRING |
| 60 | +conn = pyodbc.connect(CONNECTION_STRING) |
| 61 | + |
| 62 | +# Step 1.4. Report number of rows and columns in table <nyctaxi_trip> (outputs numbers of records and columns in trip table) |
| 63 | +nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<nyctaxi_trip>')''', conn) |
| 64 | +print 'Total number of rows = %d' % nrows.iloc[0,0] |
| 65 | + |
| 66 | +ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('<nyctaxi_trip>')''', conn) |
| 67 | +print 'Total number of columns = %d' % ncols.iloc[0,0] |
| 68 | + |
| 69 | +# Step 1.5. Report number of rows and columns in table <nyctaxi_fare> (outputs numbers of records and columns in fare table) |
| 70 | +nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<nyctaxi_fare>')''', conn) |
| 71 | +print 'Total number of rows = %d' % nrows.iloc[0,0] |
| 72 | + |
| 73 | +ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('<nyctaxi_fare>')''', conn) |
| 74 | +print 'Total number of columns = %d' % ncols.iloc[0,0] |
| 75 | + |
| 76 | +# Step 1.6 Read-in data from SQL Data Warehouse (outputs reading time and shape of data read in) |
| 77 | +t0 = time.time() |
| 78 | + |
| 79 | +#load only a small percentage of the joined data for some quick visuals |
| 80 | +df1 = pd.read_sql('''select top 10000 t.*, f.payment_type, f.fare_amount, f.surcharge, f.mta_tax, |
| 81 | + f.tolls_amount, f.total_amount, f.tip_amount |
| 82 | + from <nyctaxi_trip> t, <nyctaxi_fare> f where datepart("mi",t.pickup_datetime)=0 and t.medallion = f.medallion |
| 83 | + and t.hack_license = f.hack_license and t.pickup_datetime = f.pickup_datetime''', conn) |
| 84 | + |
| 85 | +t1 = time.time() |
| 86 | +print 'Time to read the sample table is %f seconds' % (t1-t0) |
| 87 | + |
| 88 | +print 'Number of rows and columns retrieved = (%d, %d)' % (df1.shape[0], df1.shape[1]) |
| 89 | + |
# Step 1.7. Descriptive statistics of the data (outputs statistics of data)
# Explore the sample, starting with descriptive statistics for trip distance:
trip_distance = df1['trip_distance']
trip_distance.describe()

# Step 1.8. Plot the box plot of trip_distance (outputs figures)
# Box plot of trip distance to visualize the quantiles
df1.boxplot(column='trip_distance', return_type='dict')

# Step 1.9. Plot the distribution of trip_distance (outputs figures)
# Kernel-density estimate on the left, 100-bin histogram on the right.
fig = plt.figure()
ax_kde = fig.add_subplot(1, 2, 1)
ax_hist = fig.add_subplot(1, 2, 2)
trip_distance.plot(ax=ax_kde, kind='kde', style='b-')
trip_distance.hist(ax=ax_hist, bins=100, color='k')
| 104 | + |
| 105 | +# Step 1.10. Put the trip_distance to bins |
| 106 | +trip_dist_bins = [0, 1, 2, 4, 10, 1000] |
| 107 | +df1['trip_distance'] |
| 108 | +trip_dist_bin_id = pd.cut(df1['trip_distance'], trip_dist_bins) |
| 109 | +trip_dist_bin_id |
| 110 | + |
| 111 | +# Step 1.11. Plot the bar and line charts of the trip_distance in bins (outputs figures) |
| 112 | +# The distribution of the trip distance values after binning looks like the following: |
| 113 | +pd.Series(trip_dist_bin_id).value_counts() |
| 114 | +# We can plot the above bin distribution in a bar or line plot as below |
| 115 | +pd.Series(trip_dist_bin_id).value_counts().plot(kind='bar') |
| 116 | +pd.Series(trip_dist_bin_id).value_counts().plot(kind='line') |
| 117 | +# We can also use bar plots for visualizing the sum of passengers for each vendor as follows |
| 118 | +vendor_passenger_sum = df1.groupby('vendor_id').passenger_count.sum() |
| 119 | +print vendor_passenger_sum |
| 120 | +vendor_passenger_sum.plot(kind='bar') |
| 121 | + |
# Step 1.12. Plot the Scatter plot between trip_time_in_secs and trip_distance (output figures)
# to see whether there is any correlation between them
plt.scatter(df1['trip_time_in_secs'], df1['trip_distance'])
# To further drill down on the relationship we can plot distribution side by side
# with the scatter plot (while flipping independent and dependent variables) as follows
df1_2col = df1[['trip_time_in_secs','trip_distance']]
# NOTE(review): pd.scatter_matrix was deprecated in later pandas releases in
# favor of pandas.plotting.scatter_matrix — fine for the Python 2.7 / pandas
# version this walkthrough targets.
pd.scatter_matrix(df1_2col, diagonal='hist', color='b', alpha=0.7, hist_kwds={'bins':100})
# Scatter plot of passenger_count vs. trip_distance
# (the original comment said rate_code, but the code plots passenger_count)
plt.scatter(df1['passenger_count'], df1['trip_distance'])

# Step 1.13. Calculate the correlation between trip_time_in_secs and trip_distance (outputs correlations between two columns)
# Pandas 'corr' function can be used to compute the correlation between trip_time_in_secs and trip_distance as follows:
df1[['trip_time_in_secs', 'trip_distance']].corr()
| 135 | + |
| 136 | +#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-# |
| 137 | +# Step 2: Exploring the Sampled Data in SQL Data Warehouse # |
| 138 | +#-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-# |
| 139 | +# In this section we used a sampled table we pregenerated by joining Trip and Fare data and taking a sub-sample of the full dataset. |
| 140 | +# The sample data table named '<nyctaxi_sample>' has been created and the data is loaded when you run the PowerShell script. |
| 141 | +# Step 2.1. Report number of rows and columns in the sampled table (outputs numbers of rows and columns in the sampled data table |
| 142 | +nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<nyctaxi_sample>')''', conn) |
| 143 | +print 'Number of rows in sample = %d' % nrows.iloc[0,0] |
| 144 | + |
| 145 | +ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('<nyctaxi_sample>')''', conn) |
| 146 | +print 'Number of columns in sample = %d' % ncols.iloc[0,0] |
| 147 | + |
# Step 2.2. Check the tipped/not tipped distribution (outputs counts of trips in tipped/not tipped classes)
tipped_query = '''
    SELECT tipped, count(*) AS tip_freq
    FROM <nyctaxi_sample>
    GROUP BY tipped
    '''

pd.read_sql(tipped_query, conn)
| 156 | + |
# Step 2.3. Check the tip class (tip_amount) distribution (outputs counts of trips in tip classes)
tip_class_query = '''
    SELECT tip_class, count(*) AS tip_freq
    FROM <nyctaxi_sample>
    GROUP BY tip_class
'''

tip_class_dist = pd.read_sql(tip_class_query, conn)
tip_class_dist

# Step 2.4. Plot the tip distribution by class (outputs figures)
tip_class_dist['tip_freq'].plot(kind='bar')
| 169 | + |
# Step 2.5. Count the number of trips each day (outputs a data frame with count of trips in each day)
# Group by the dropoff date (datetime truncated to a date via CONVERT).
daily_trip_query = '''
    SELECT CONVERT(date, dropoff_datetime) as date, count(*) as c
    from <nyctaxi_sample>
    group by CONVERT(date, dropoff_datetime)
    '''
pd.read_sql(daily_trip_query, conn)
| 177 | + |
# Step 2.6. Count the number of trips per each medallion (outputs a data frame with count of trips by each medallion ID)
medallion_query = '''select medallion,count(*) as c from <nyctaxi_sample> group by medallion'''
pd.read_sql(medallion_query, conn)

# Step 2.7. Count the number of trips per each medallion and license (outputs a data frame)
medallion_license_query = '''select medallion, hack_license,count(*) from <nyctaxi_sample> group by medallion, hack_license'''
pd.read_sql(medallion_license_query, conn)

# Step 2.8. Count the number of trips by trip_time_in_secs (outputs a data frame)
trip_time_query = '''select trip_time_in_secs, count(*) from <nyctaxi_sample> group by trip_time_in_secs order by count(*) desc'''
pd.read_sql(trip_time_query, conn)

# Step 2.9. Count the number of trips by trip_distance (outputs a data frame)
# Distances are bucketed into 5-mile-wide bins via floor(trip_distance/5)*5.
trip_dist_query = '''select floor(trip_distance/5)*5 as tripbin, count(*) from <nyctaxi_sample> group by floor(trip_distance/5)*5 order by count(*) desc'''
pd.read_sql(trip_dist_query, conn)

# Step 2.10. Count the number of trips by payment type (outputs a data frame)
payment_type_query = '''select payment_type,count(*) from <nyctaxi_sample> group by payment_type'''
pd.read_sql(payment_type_query, conn)

# Step 2.11. Read the top 10 observations from the sample table (outputs a data frame)
top_ten_query = '''select TOP 10 * from <nyctaxi_sample>'''
pd.read_sql(top_ten_query, conn)