Skip to content
This repository was archived by the owner on Jun 29, 2019. It is now read-only.

Commit 3168158

Browse files
committed
edited by Hang for SQLDW walkthrough
1 parent c8f09d7 commit 3168158

4 files changed

Lines changed: 902 additions & 100 deletions

File tree

Misc/SQLDW/SQLDW_Data_Import.ps1

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ function ReadHostInput(){
33
$StorageAccountKey0 = Read-Host -Prompt 'Input the storage account key' -AsSecureString
44
$StorageAccountKey1 = [System.Runtime.InteropServices.Marshal]::SecureStringToBSTR($StorageAccountKey0)
55
$Script:StorageAccountKey = [System.Runtime.InteropServices.Marshal]::PtrToStringAuto($StorageAccountKey1)
6-
$ContainerName0 = Read-Host -Prompt 'Input your storage account container name to upload the NYC Taxi dataset to. Only letters, numbers, and the dash (-) character'
6+
$ContainerName0 = Read-Host -Prompt 'Input your storage account container name to upload the NYC Taxi dataset to. Only letters (lower case), numbers, and the dash (-) character are allowed'
77
$Script:Server = Read-Host -Prompt 'Input the SQL DW server name'
88
$Script:Database = Read-Host -Prompt 'Input the SQL DW database name'
99
$Script:Username = Read-Host -Prompt 'Input the SQL DW user name'
@@ -420,7 +420,7 @@ try
420420
(gc ./SQLDW_Explorations.ipynb) -replace '<database name>', $Database
421421
(gc ./SQLDW_Explorations.ipynb) -replace '<user name>', $Username
422422
(gc ./SQLDW_Explorations.ipynb) -replace '<password>', $Password
423-
(gc ./SQLDW_Explorations.ipynb) -replace '<database server>', 'SQL Server Native Client 11.0'
423+
(gc ./SQLDW_Explorations.ipynb) -replace '<database driver>', 'SQL Server Native Client 11.0'
424424

425425
}
426426

Misc/SQLDW/SQLDW_Explorations.ipynb

Lines changed: 782 additions & 1 deletion
Large diffs are not rendered by default.

Misc/SQLDW/SQLDW_Explorations.sql

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,21 @@
44
-- Report number of columns in table <nyctaxi_trip>
55
SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '<nyctaxi_trip>'
66

7+
-- Exploration: Trip distribution by medallion
78
SELECT medallion, COUNT(*)
89
FROM <nyctaxi_fare>
910
WHERE pickup_datetime BETWEEN '20130101' AND '20130331'
1011
GROUP BY medallion
1112
HAVING COUNT(*) > 100
1213

14+
-- Exploration: Trip distribution by medallion and hack_license
1315
SELECT medallion, hack_license, COUNT(*)
1416
FROM <nyctaxi_fare>
1517
WHERE pickup_datetime BETWEEN '20130101' AND '20130131'
1618
GROUP BY medallion, hack_license
1719
HAVING COUNT(*) > 100
1820

21+
-- Data Quality Assessment: Verify records with incorrect longitude and/or latitude
1922
SELECT COUNT(*) FROM <nyctaxi_trip>
2023
WHERE pickup_datetime BETWEEN '20130101' AND '20130331'
2124
AND (CAST(pickup_longitude AS float) NOT BETWEEN -90 AND 90
@@ -25,12 +28,14 @@
2528
OR (pickup_longitude = '0' AND pickup_latitude = '0')
2629
OR (dropoff_longitude = '0' AND dropoff_latitude = '0'))
2730

31+
-- Exploration: Tipped vs. Not Tipped Trips distribution
2832
SELECT tipped, COUNT(*) AS tip_freq FROM (
2933
SELECT CASE WHEN (tip_amount > 0) THEN 1 ELSE 0 END AS tipped, tip_amount
3034
FROM <nyctaxi_fare>
3135
WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc
3236
GROUP BY tipped
3337

38+
-- Exploration: Tip Class/Range Distribution
3439
SELECT tip_class, COUNT(*) AS tip_freq FROM (
3540
SELECT CASE
3641
WHEN (tip_amount = 0) THEN 0
@@ -43,7 +48,7 @@
4348
WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc
4449
GROUP BY tip_class
4550

46-
51+
-- Exploration: Compute and Compare Trip Distance
4752
/****** Object: UserDefinedFunction [dbo].[fnCalculateDistance] ******/
4853
SET ANSI_NULLS ON
4954
GO
@@ -55,8 +60,9 @@
5560
DROP FUNCTION fnCalculateDistance
5661
GO
5762

58-
CREATE FUNCTION [dbo].[fnCalculateDistance] (@Lat1 float, @Long1 float, @Lat2 float, @Long2 float)
5963
-- User-defined function to calculate the direct distance between two geographical coordinates.
64+
CREATE FUNCTION [dbo].[fnCalculateDistance] (@Lat1 float, @Long1 float, @Lat2 float, @Long2 float)
65+
6066
RETURNS float
6167
AS
6268
BEGIN
@@ -77,6 +83,7 @@
7783
END
7884
GO
7985

86+
-- Sample query to call the function to create features
8087
SELECT pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude,
8188
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) AS DirectDistance
8289
FROM <nyctaxi_trip>
@@ -85,6 +92,7 @@
8592
AND CAST(dropoff_latitude AS float) BETWEEN -90 AND 90
8693
AND pickup_longitude != '0' AND dropoff_longitude != '0'
8794

95+
-- Preparing Data for Model Building
8896
SELECT t.*, f.payment_type, f.fare_amount, f.surcharge, f.mta_tax, f.tolls_amount, f.total_amount, f.tip_amount,
8997
CASE WHEN (tip_amount > 0) THEN 1 ELSE 0 END AS tipped,
9098
CASE WHEN (tip_amount = 0) THEN 0
@@ -100,7 +108,7 @@
100108
AND t.pickup_datetime = f.pickup_datetime
101109
AND pickup_longitude != '0' AND dropoff_longitude != '0'
102110

103-
111+
-- Persist query results in a sample table
104112
CREATE TABLE <nyctaxi_sample>
105113
WITH
106114
(

0 commit comments

Comments
 (0)