You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
{{ message }}
This repository was archived by the owner on Jun 29, 2019. It is now read-only.
Copy file name to clipboardExpand all lines: Misc/SQLDW/machine-learning-data-science-process-sqldw-walkthrough.md
+26-26Lines changed: 26 additions & 26 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -167,25 +167,25 @@ In this exercise, we will:
167
167
For a quick verification of the number of rows and columns in the tables populated earlier using parallel bulk import, run the following queries:
168
168
169
169
-- Report number of rows in table <nyctaxi_trip> without table scan
170
-
SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<nyctaxi_trip>')
170
+
SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_trip>')
171
171
172
172
-- Report number of columns in table <nyctaxi_trip>
173
-
SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '<nyctaxi_trip>'
173
+
SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '<nyctaxi_trip>' AND table_schema = '<schemaname>'
174
174
175
175
#### Exploration: Trip distribution by medallion
176
176
177
177
This example identifies the medallion (taxi numbers) with more than 100 trips within a given time period. The query would benefit from the partitioned table access since it is conditioned by the partition scheme of **pickup\_datetime**. Querying the full dataset will also make use of the partitioned table and/or index scan.
178
178
179
179
SELECT medallion, COUNT(*)
180
-
FROM <nyctaxi_fare>
180
+
FROM <schemaname>.<nyctaxi_fare>
181
181
WHERE pickup_datetime BETWEEN '20130101' AND '20130331'
182
182
GROUP BY medallion
183
183
HAVING COUNT(*) > 100
184
184
185
185
#### Exploration: Trip distribution by medallion and hack_license
186
186
187
187
SELECT medallion, hack_license, COUNT(*)
188
-
FROM <nyctaxi_fare>
188
+
FROM <schemaname>.<nyctaxi_fare>
189
189
WHERE pickup_datetime BETWEEN '20130101' AND '20130131'
190
190
GROUP BY medallion, hack_license
191
191
HAVING COUNT(*) > 100
@@ -194,7 +194,7 @@ This example identifies the medallion (taxi numbers) with more than 100 trips wi
194
194
195
195
This example investigates if any of the longitude and/or latitude fields either contain an invalid value (coordinates in degrees should be between -90 and 90), or have (0, 0) coordinates.
196
196
197
-
SELECT COUNT(*) FROM <nyctaxi_trip>
197
+
SELECT COUNT(*) FROM <schemaname>.<nyctaxi_trip>
198
198
WHERE pickup_datetime BETWEEN '20130101' AND '20130331'
199
199
AND (CAST(pickup_longitude AS float) NOT BETWEEN -90 AND 90
200
200
OR CAST(pickup_latitude AS float) NOT BETWEEN -90 AND 90
@@ -209,7 +209,7 @@ This example finds the number of trips that were tipped vs. not tipped in a give
209
209
210
210
SELECT tipped, COUNT(*) AS tip_freq FROM (
211
211
SELECT CASE WHEN (tip_amount > 0) THEN 1 ELSE 0 END AS tipped, tip_amount
212
-
FROM <nyctaxi_fare>
212
+
FROM <schemaname>.<nyctaxi_fare>
213
213
WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc
214
214
GROUP BY tipped
215
215
@@ -225,7 +225,7 @@ This example computes the distribution of tip ranges in a given time period (or
225
225
WHEN (tip_amount > 10 AND tip_amount <= 20) THEN 3
226
226
ELSE 4
227
227
END AS tip_class
228
-
FROM <nyctaxi_fare>
228
+
FROM <schemaname>.<nyctaxi_fare>
229
229
WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc
230
230
GROUP BY tip_class
231
231
@@ -269,7 +269,7 @@ This example converts the pickup and drop-off longitude and latitude to SQL geog
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) AS DirectDistance
322
-
FROM <nyctaxi_trip>
322
+
FROM <schemaname>.<nyctaxi_trip>
323
323
WHERE datepart("mi",pickup_datetime)=1
324
324
AND CAST(pickup_latitude AS float) BETWEEN -90 AND 90
325
325
AND CAST(dropoff_latitude AS float) BETWEEN -90 AND 90
@@ -337,7 +337,7 @@ The following query joins the **nyctaxi\_trip** and **nyctaxi\_fare** tables, ge
337
337
WHEN (tip_amount > 10 AND tip_amount <= 20) THEN 3
338
338
ELSE 4
339
339
END AS tip_class
340
-
FROM <nyctaxi_trip> t, <nyctaxi_fare> f
340
+
FROM <schemaname>.<nyctaxi_trip> t, <schemaname>.<nyctaxi_fare> f
341
341
WHERE datepart("mi",t.pickup_datetime) = 1
342
342
AND t.medallion = f.medallion
343
343
AND t.hack_license = f.hack_license
@@ -411,14 +411,14 @@ Initialize your database connection settings in the following variables:
411
411
412
412
nrows = pd.read_sql('''
413
413
SELECT SUM(rows) FROM sys.partitions
414
-
WHERE object_id = OBJECT_ID('<nyctaxi_trip>')
414
+
WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_trip>')
415
415
''', conn)
416
416
417
417
print 'Total number of rows = %d' % nrows.iloc[0,0]
418
418
419
419
ncols = pd.read_sql('''
420
420
SELECT COUNT(*) FROM information_schema.columns
421
-
WHERE table_name = ('<nyctaxi_trip>')
421
+
WHERE table_name = ('<nyctaxi_trip>') AND table_schema = ('<schemaname>')
422
422
''', conn)
423
423
424
424
print 'Total number of columns = %d' % ncols.iloc[0,0]
@@ -430,14 +430,14 @@ Initialize your database connection settings in the following variables:
430
430
431
431
nrows = pd.read_sql('''
432
432
SELECT SUM(rows) FROM sys.partitions
433
-
WHERE object_id = OBJECT_ID('<nyctaxi_fare>')
433
+
WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_fare>')
434
434
''', conn)
435
435
436
436
print 'Total number of rows = %d' % nrows.iloc[0,0]
437
437
438
438
ncols = pd.read_sql('''
439
439
SELECT COUNT(*) FROM information_schema.columns
440
-
WHERE table_name = ('<nyctaxi_fare>')
440
+
WHERE table_name = ('<nyctaxi_fare>') AND table_schema = ('<schemaname>')
441
441
''', conn)
442
442
443
443
print 'Total number of columns = %d' % ncols.iloc[0,0]
@@ -452,7 +452,7 @@ Initialize your database connection settings in the following variables:
452
452
query = '''
453
453
SELECT TOP 10000 t.*, f.payment_type, f.fare_amount, f.surcharge, f.mta_tax,
454
454
f.tolls_amount, f.total_amount, f.tip_amount
455
-
FROM <nyctaxi_trip> t, <nyctaxi_fare> f
455
+
FROM <schemaname>.<nyctaxi_trip> t, <schemaname>.<nyctaxi_fare> f
456
456
WHERE datepart("mi",t.pickup_datetime) = 1
457
457
AND t.medallion = f.medallion
458
458
AND t.hack_license = f.hack_license
@@ -535,17 +535,17 @@ In this section, we explore data distributions using the sampled data which is p
535
535
536
536
#### Exploration: Report number of rows and columns in the sampled table
537
537
538
-
nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('nyctaxi_sample')''', conn)
538
+
nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_sample>')''', conn)
539
539
print 'Number of rows in sample = %d' % nrows.iloc[0,0]
540
540
541
-
ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('nyctaxi_sample')''', conn)
541
+
ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('<nyctaxi_sample>') AND table_schema = ('<schemaname>')''', conn)
542
542
print 'Number of columns in sample = %d' % ncols.iloc[0,0]
543
543
544
544
#### Exploration: Tipped/Not Tipped Distribution
545
545
546
546
query = '''
547
547
SELECT tipped, count(*) AS tip_freq
548
-
FROM nyctaxi_sample
548
+
FROM <schemaname>.<nyctaxi_sample>
549
549
GROUP BY tipped
550
550
'''
551
551
@@ -555,7 +555,7 @@ In this section, we explore data distributions using the sampled data which is p
555
555
556
556
query = '''
557
557
SELECT tip_class, count(*) AS tip_freq
558
-
FROM nyctaxi_sample
558
+
FROM <schemaname>.<nyctaxi_sample>
559
559
GROUP BY tip_class
560
560
'''
561
561
@@ -569,7 +569,7 @@ In this section, we explore data distributions using the sampled data which is p
569
569
570
570
query = '''
571
571
SELECT CONVERT(date, dropoff_datetime) AS date, COUNT(*) AS c
572
-
FROM nyctaxi_sample
572
+
FROM <schemaname>.<nyctaxi_sample>
573
573
GROUP BY CONVERT(date, dropoff_datetime)
574
574
'''
575
575
@@ -579,36 +579,36 @@ In this section, we explore data distributions using the sampled data which is p
579
579
580
580
query = '''
581
581
SELECT medallion,count(*) AS c
582
-
FROM nyctaxi_sample
582
+
FROM <schemaname>.<nyctaxi_sample>
583
583
GROUP BY medallion
584
584
'''
585
585
586
586
pd.read_sql(query,conn)
587
587
588
588
#### Exploration: Trip distribution by medallion and Hack License
589
589
590
-
query = '''select medallion, hack_license,count(*) from nyctaxi_sample group by medallion, hack_license'''
590
+
query = '''select medallion, hack_license,count(*) from <schemaname>.<nyctaxi_sample> group by medallion, hack_license'''
591
591
pd.read_sql(query,conn)
592
592
593
593
594
594
#### Exploration: Trip Time Distribution
595
595
596
-
query = '''select trip_time_in_secs, count(*) from nyctaxi_sample group by trip_time_in_secs order by count(*) desc'''
596
+
query = '''select trip_time_in_secs, count(*) from <schemaname>.<nyctaxi_sample> group by trip_time_in_secs order by count(*) desc'''
597
597
pd.read_sql(query,conn)
598
598
599
599
#### Exploration: Trip Distance Distribution
600
600
601
-
query = '''select floor(trip_distance/5)*5 as tripbin, count(*) from nyctaxi_sample group by floor(trip_distance/5)*5 order by count(*) desc'''
601
+
query = '''select floor(trip_distance/5)*5 as tripbin, count(*) from <schemaname>.<nyctaxi_sample> group by floor(trip_distance/5)*5 order by count(*) desc'''
602
602
pd.read_sql(query,conn)
603
603
604
604
#### Exploration: Payment Type Distribution
605
605
606
-
query = '''select payment_type,count(*) from nyctaxi_sample group by payment_type'''
606
+
query = '''select payment_type,count(*) from <schemaname>.<nyctaxi_sample> group by payment_type'''
607
607
pd.read_sql(query,conn)
608
608
609
609
#### Verify the final form of the featurized table
610
610
611
-
query = '''SELECT TOP 100 * FROM nyctaxi_sample'''
611
+
query = '''SELECT TOP 100 * FROM <schemaname>.<nyctaxi_sample>'''
612
612
pd.read_sql(query,conn)
613
613
614
614
We are now ready to proceed to model building and model deployment in [Azure Machine Learning](https://studio.azureml.net). The data is ready for any of the prediction problems identified earlier, namely:
0 commit comments