Skip to content
This repository was archived by the owner on Jun 29, 2019. It is now read-only.

Commit e065418

Browse files
committed
Revised schema names in markdown
1 parent 723bc45 commit e065418

1 file changed

Lines changed: 26 additions & 26 deletions

File tree

Misc/SQLDW/machine-learning-data-science-process-sqldw-walkthrough.md

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -167,25 +167,25 @@ In this exercise, we will:
167167
For a quick verification of the number of rows and columns in the tables populated earlier using parallel bulk import,
168168

169169
-- Report number of rows in table <nyctaxi_trip> without table scan
170-
SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<nyctaxi_trip>')
170+
SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_trip>')
171171

172172
-- Report number of columns in table <nyctaxi_trip>
173-
SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '<nyctaxi_trip>'
173+
SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '<nyctaxi_trip>' AND table_schema = '<schemaname>'
174174

175175
#### Exploration: Trip distribution by medallion
176176

177177
This example identifies the medallion (taxi numbers) with more than 100 trips within a given time period. The query would benefit from the partitioned table access since it is conditioned by the partition scheme of **pickup\_datetime**. Querying the full dataset will also make use of the partitioned table and/or index scan.
178178

179179
SELECT medallion, COUNT(*)
180-
FROM <nyctaxi_fare>
180+
FROM <schemaname>.<nyctaxi_fare>
181181
WHERE pickup_datetime BETWEEN '20130101' AND '20130331'
182182
GROUP BY medallion
183183
HAVING COUNT(*) > 100
184184

185185
#### Exploration: Trip distribution by medallion and hack_license
186186

187187
SELECT medallion, hack_license, COUNT(*)
188-
FROM <nyctaxi_fare>
188+
FROM <schemaname>.<nyctaxi_fare>
189189
WHERE pickup_datetime BETWEEN '20130101' AND '20130131'
190190
GROUP BY medallion, hack_license
191191
HAVING COUNT(*) > 100
@@ -194,7 +194,7 @@ This example identifies the medallion (taxi numbers) with more than 100 trips wi
194194

195195
This example investigates if any of the longitude and/or latitude fields either contain an invalid value (coordinates are decimal degrees and should be between -90 and 90), or have (0, 0) coordinates.
196196

197-
SELECT COUNT(*) FROM <nyctaxi_trip>
197+
SELECT COUNT(*) FROM <schemaname>.<nyctaxi_trip>
198198
WHERE pickup_datetime BETWEEN '20130101' AND '20130331'
199199
AND (CAST(pickup_longitude AS float) NOT BETWEEN -90 AND 90
200200
OR CAST(pickup_latitude AS float) NOT BETWEEN -90 AND 90
@@ -209,7 +209,7 @@ This example finds the number of trips that were tipped vs. not tipped in a give
209209

210210
SELECT tipped, COUNT(*) AS tip_freq FROM (
211211
SELECT CASE WHEN (tip_amount > 0) THEN 1 ELSE 0 END AS tipped, tip_amount
212-
FROM <nyctaxi_fare>
212+
FROM <schemaname>.<nyctaxi_fare>
213213
WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc
214214
GROUP BY tipped
215215

@@ -225,7 +225,7 @@ This example computes the distribution of tip ranges in a given time period (or
225225
WHEN (tip_amount > 10 AND tip_amount <= 20) THEN 3
226226
ELSE 4
227227
END AS tip_class
228-
FROM <nyctaxi_fare>
228+
FROM <schemaname>.<nyctaxi_fare>
229229
WHERE pickup_datetime BETWEEN '20130101' AND '20131231') tc
230230
GROUP BY tip_class
231231

@@ -269,7 +269,7 @@ This example converts the pickup and drop-off longitude and latitude to SQL geog
269269

270270
SELECT pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude,
271271
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) AS DirectDistance
272-
FROM <nyctaxi_trip>
272+
FROM <schemaname>.<nyctaxi_trip>
273273
WHERE datepart("mi",pickup_datetime)=1
274274
AND CAST(pickup_latitude AS float) BETWEEN -90 AND 90
275275
AND CAST(dropoff_latitude AS float) BETWEEN -90 AND 90
@@ -319,7 +319,7 @@ Here is an example to call this function to generate features in your SQL query:
319319
-- Sample query to call the function to create features
320320
SELECT pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude,
321321
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) AS DirectDistance
322-
FROM <nyctaxi_trip>
322+
FROM <schemaname>.<nyctaxi_trip>
323323
WHERE datepart("mi",pickup_datetime)=1
324324
AND CAST(pickup_latitude AS float) BETWEEN -90 AND 90
325325
AND CAST(dropoff_latitude AS float) BETWEEN -90 AND 90
@@ -337,7 +337,7 @@ The following query joins the **nyctaxi\_trip** and **nyctaxi\_fare** tables, ge
337337
WHEN (tip_amount > 10 AND tip_amount <= 20) THEN 3
338338
ELSE 4
339339
END AS tip_class
340-
FROM <nyctaxi_trip> t, <nyctaxi_fare> f
340+
FROM <schemaname>.<nyctaxi_trip> t, <schemaname>.<nyctaxi_fare> f
341341
WHERE datepart("mi",t.pickup_datetime) = 1
342342
AND t.medallion = f.medallion
343343
AND t.hack_license = f.hack_license
@@ -411,14 +411,14 @@ Initialize your database connection settings in the following variables:
411411

412412
nrows = pd.read_sql('''
413413
SELECT SUM(rows) FROM sys.partitions
414-
WHERE object_id = OBJECT_ID('<nyctaxi_trip>')
414+
WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_trip>')
415415
''', conn)
416416

417417
print 'Total number of rows = %d' % nrows.iloc[0,0]
418418

419419
ncols = pd.read_sql('''
420420
SELECT COUNT(*) FROM information_schema.columns
421-
WHERE table_name = ('<nyctaxi_trip>')
421+
WHERE table_name = ('<nyctaxi_trip>') AND table_schema = ('<schemaname>')
422422
''', conn)
423423

424424
print 'Total number of columns = %d' % ncols.iloc[0,0]
@@ -430,14 +430,14 @@ Initialize your database connection settings in the following variables:
430430

431431
nrows = pd.read_sql('''
432432
SELECT SUM(rows) FROM sys.partitions
433-
WHERE object_id = OBJECT_ID('<nyctaxi_fare>')
433+
WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_fare>')
434434
''', conn)
435435

436436
print 'Total number of rows = %d' % nrows.iloc[0,0]
437437

438438
ncols = pd.read_sql('''
439439
SELECT COUNT(*) FROM information_schema.columns
440-
WHERE table_name = ('<nyctaxi_fare>')
440+
WHERE table_name = ('<nyctaxi_fare>') AND table_schema = ('<schemaname>')
441441
''', conn)
442442

443443
print 'Total number of columns = %d' % ncols.iloc[0,0]
@@ -452,7 +452,7 @@ Initialize your database connection settings in the following variables:
452452
query = '''
453453
SELECT TOP 10000 t.*, f.payment_type, f.fare_amount, f.surcharge, f.mta_tax,
454454
f.tolls_amount, f.total_amount, f.tip_amount
455-
FROM <nyctaxi_trip> t, <nyctaxi_fare> f
455+
FROM <schemaname>.<nyctaxi_trip> t, <schemaname>.<nyctaxi_fare> f
456456
WHERE datepart("mi",t.pickup_datetime) = 1
457457
AND t.medallion = f.medallion
458458
AND t.hack_license = f.hack_license
@@ -535,17 +535,17 @@ In this section, we explore data distributions using the sampled data which is p
535535

536536
#### Exploration: Report number of rows and columns in the sampled table
537537

538-
nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('nyctaxi_sample')''', conn)
538+
nrows = pd.read_sql('''SELECT SUM(rows) FROM sys.partitions WHERE object_id = OBJECT_ID('<schemaname>.<nyctaxi_sample>')''', conn)
539539
print 'Number of rows in sample = %d' % nrows.iloc[0,0]
540540

541-
ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('nyctaxi_sample')''', conn)
541+
ncols = pd.read_sql('''SELECT count(*) FROM information_schema.columns WHERE table_name = ('<nyctaxi_sample>') AND table_schema = '<schemaname>' ''', conn)
542542
print 'Number of columns in sample = %d' % ncols.iloc[0,0]
543543

544544
#### Exploration: Tipped/Not Tipped Distribution
545545

546546
query = '''
547547
SELECT tipped, count(*) AS tip_freq
548-
FROM nyctaxi_sample
548+
FROM <schemaname>.<nyctaxi_sample>
549549
GROUP BY tipped
550550
'''
551551

@@ -555,7 +555,7 @@ In this section, we explore data distributions using the sampled data which is p
555555

556556
query = '''
557557
SELECT tip_class, count(*) AS tip_freq
558-
FROM nyctaxi_sample
558+
FROM <schemaname>.<nyctaxi_sample>
559559
GROUP BY tip_class
560560
'''
561561

@@ -569,7 +569,7 @@ In this section, we explore data distributions using the sampled data which is p
569569

570570
query = '''
571571
SELECT CONVERT(date, dropoff_datetime) AS date, COUNT(*) AS c
572-
FROM nyctaxi_sample
572+
FROM <schemaname>.<nyctaxi_sample>
573573
GROUP BY CONVERT(date, dropoff_datetime)
574574
'''
575575

@@ -579,36 +579,36 @@ In this section, we explore data distributions using the sampled data which is p
579579

580580
query = '''
581581
SELECT medallion,count(*) AS c
582-
FROM nyctaxi_sample
582+
FROM <schemaname>.<nyctaxi_sample>
583583
GROUP BY medallion
584584
'''
585585

586586
pd.read_sql(query,conn)
587587

588588
#### Exploration: Trip distribution by medallion and Hack License
589589

590-
query = '''select medallion, hack_license,count(*) from nyctaxi_sample group by medallion, hack_license'''
590+
query = '''select medallion, hack_license,count(*) from <schemaname>.<nyctaxi_sample> group by medallion, hack_license'''
591591
pd.read_sql(query,conn)
592592

593593

594594
#### Exploration: Trip Time Distribution
595595

596-
query = '''select trip_time_in_secs, count(*) from nyctaxi_sample group by trip_time_in_secs order by count(*) desc'''
596+
query = '''select trip_time_in_secs, count(*) from <schemaname>.<nyctaxi_sample> group by trip_time_in_secs order by count(*) desc'''
597597
pd.read_sql(query,conn)
598598

599599
#### Exploration: Trip Distance Distribution
600600

601-
query = '''select floor(trip_distance/5)*5 as tripbin, count(*) from nyctaxi_sample group by floor(trip_distance/5)*5 order by count(*) desc'''
601+
query = '''select floor(trip_distance/5)*5 as tripbin, count(*) from <schemaname>.<nyctaxi_sample> group by floor(trip_distance/5)*5 order by count(*) desc'''
602602
pd.read_sql(query,conn)
603603

604604
#### Exploration: Payment Type Distribution
605605

606-
query = '''select payment_type,count(*) from nyctaxi_sample group by payment_type'''
606+
query = '''select payment_type,count(*) from <schemaname>.<nyctaxi_sample> group by payment_type'''
607607
pd.read_sql(query,conn)
608608

609609
#### Verify the final form of the featurized table
610610

611-
query = '''SELECT TOP 100 * FROM nyctaxi_sample'''
611+
query = '''SELECT TOP 100 * FROM <schemaname>.<nyctaxi_sample>'''
612612
pd.read_sql(query,conn)
613613

614614
We are now ready to proceed to model building and model deployment in [Azure Machine Learning](https://studio.azureml.net). The data is ready for any of the prediction problems identified earlier, namely:

0 commit comments

Comments
 (0)