Skip to content
This repository was archived by the owner on Jun 29, 2019. It is now read-only.

Commit 92f1733

Browse files
committed
2 parents 35a2a92 + 51422ec commit 92f1733

9 files changed

Lines changed: 420 additions & 30 deletions

Misc/RSQL/FilestoDownload_R_Walkthrough.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,7 @@ create-db-tb-upload-data.sql
33
PersistModel.sql
44
PredictTipSingleMode.sql
55
PredictTipBatchMode.sql
6-
RSQL_R_Walkthrough.R
6+
fnCalculateDistance.sql
7+
fnEngineerFeatures.sql
8+
RSQL_R_Walkthrough.R
9+
taxiimportfmt.xml

Misc/RSQL/PersistModel.sql

-469 Bytes
Binary file not shown.

Misc/RSQL/PredictTipBatchMode.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ GO
88
SET QUOTED_IDENTIFIER ON
99
GO
1010

11-
IF EXISTS (SELECT * FROM sys.objects WHERE type = 'P' AND name = 'PredictTip')
12-
DROP PROCEDURE PredictTip
11+
IF EXISTS (SELECT * FROM sys.objects WHERE type = 'P' AND name = 'PredictTipBatchMode')
12+
DROP PROCEDURE PredictTipBatchMode
1313
GO
1414

1515
CREATE PROCEDURE [dbo].[PredictTipBatchMode] @inquery nvarchar(max)

Misc/RSQL/RSQL_R_Walkthrough.R

Lines changed: 238 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,245 @@
1+
# PreRequisites: You have installed Revolution R Enterprise 7.5.0 or higher on the machine and SQL Server 2016 CTP3 or higher on the database server
2+
# Install required R libraries for this walkthrough if they are not installed.
3+
4+
if (!('ggmap' %in% rownames(installed.packages()))){
5+
install.packages('ggmap')
6+
}
7+
if (!('mapproj' %in% rownames(installed.packages()))){
8+
install.packages('mapproj')
9+
}
10+
if (!('ROCR' %in% rownames(installed.packages()))){
11+
install.packages('ROCR')
12+
}
13+
if (!('RODBC' %in% rownames(installed.packages()))){
14+
install.packages('RODBC')
15+
}
16+
117
library(RevoScaleR)
2-
# connection string
3-
# currently we use SQL authentication
4-
connStr <- "Driver=SQL Server;Server=<your_server_name.somedomain.com>;Database=<Your_Database_Name>;Uid=<Your_User_Name>;Pwd=<Your_Password>"
518

6-
# set ComputeContext
19+
# Define the connection string
20+
# This walkthrough requires SQL authentication
21+
connStr <- "Driver=SQL Server;Server=<Your_Server_Name.somedomain.com>;Database=<Your_Database_Name>;Uid=<Your_User_Name>;Pwd=<Your_Password>"
22+
23+
# Set ComputeContext. Needs a temp directory path to serialize R objects back and forth
724
sqlShareDir <- paste("C:\\AllShare\\",Sys.getenv("USERNAME"),sep="")
825
sqlWait <- TRUE
926
sqlConsoleOutput <- FALSE
1027
cc <- RxInSqlServer(connectionString = connStr, shareDir = sqlShareDir,
1128
wait = sqlWait, consoleOutput = sqlConsoleOutput)
12-
rxSetComputeContext(cc)
29+
rxSetComputeContext(cc)
30+
31+
#Define a DataSource (from a select query) to be used to explore the data and generate features from.
32+
#Keep in mind that inDataSource is just a reference to the result dataset from the SQL query.
33+
sampleDataQuery <- "select top 1000 tipped, fare_amount, passenger_count,trip_time_in_secs,trip_distance,
34+
pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude, dropoff_longitude,
35+
dropoff_latitude from nyctaxi_joined_1_percent"
36+
37+
38+
inDataSource <- RxSqlServerData(sqlQuery = sampleDataQuery, connectionString = connStr,
39+
colClasses = c(pickup_longitude = "numeric", pickup_latitude = "numeric",
40+
dropoff_longitude = "numeric", dropoff_latitude = "numeric"),
41+
rowsPerRead=500)
42+
43+
################################
44+
# Data exploration #
45+
################################
46+
# Summarize the inDataSource
47+
rxGetVarInfo(data = inDataSource)
48+
start.time <- proc.time()
49+
rxSummary(~fare_amount:F(passenger_count), data = inDataSource)
50+
used.time <- proc.time() - start.time
51+
print(paste("It takes CPU Time=", round(used.time[1]+used.time[2],2)," seconds, Elapsed Time=",
52+
round(used.time[3],2), " seconds to summarize the inDataSource.", sep=""))
53+
54+
################################
55+
# Data Visualization #
56+
################################
57+
options(jupyter.plot_mimetypes = 'image/png')
58+
59+
# Plot fare amount histogram on the SQL Server, and ship the plot to R client to display
60+
start.time <- proc.time()
61+
rxHistogram(~fare_amount, data = inDataSource, title = "Fare Amount Histogram")
62+
used.time <- proc.time() - start.time
63+
print(paste("It takes CPU Time=", round(used.time[1]+used.time[2],2),
64+
" seconds, Elapsed Time=", round(used.time[3],2), " seconds to generate histogram.", sep=""))
65+
66+
# Plot pickup location on map in SQL Server
67+
# Define a function that plots points on a map
68+
mapPlot <- function(inDataSource, googMap){
69+
library(ggmap)
70+
library(mapproj)
71+
72+
# Open Source R functions require data to be brought back in memory into data frames. Use rxImport to bring in data.
73+
# Remember: This whole function runs in the SQL Server Context.
74+
ds <- rxImport(inDataSource)
75+
76+
p<-ggmap(googMap)+
77+
geom_point(aes(x = pickup_longitude, y =pickup_latitude ),
78+
data=ds, alpha =.5, color="darkred", size = 1.5)
79+
80+
return(list(myplot=p))
81+
}
82+
83+
library(ggmap)
84+
library(mapproj)
85+
# Get the map with Times Square, NY as the center. This is run on the R Client
86+
gc <- geocode("Times Square", source = "google")
87+
googMap <- get_googlemap(center = as.numeric(gc), zoom = 12, maptype = 'roadmap', color = 'color')
88+
# Run the points plotting on SQL server. Passing in the map data as arg to remotely executed function.
89+
# The points are in the database and will be plotted on the map
90+
myplots <- rxExec(mapPlot, inDataSource, googMap, timesToRun = 1)
91+
plot(myplots[[1]][["myplot"]])
92+
93+
################################
94+
# Feature engineering #
95+
################################
96+
# Define a function in open source R to calculate the direct distance between pickup and dropoff as a new feature
97+
# Use Haversine Formula: https://en.wikipedia.org/wiki/Haversine_formula
98+
env <- new.env()
99+
100+
env$ComputeDist <- function(pickup_long, pickup_lat, dropoff_long, dropoff_lat){
101+
R <- 6371/1.609344 #radius in mile
102+
delta_lat <- dropoff_lat - pickup_lat
103+
delta_long <- dropoff_long - pickup_long
104+
degrees_to_radians = pi/180.0
105+
a1 <- sin(delta_lat/2*degrees_to_radians)
106+
a2 <- as.numeric(a1)^2
107+
a3 <- cos(pickup_lat*degrees_to_radians)
108+
a4 <- cos(dropoff_lat*degrees_to_radians)
109+
a5 <- sin(delta_long/2*degrees_to_radians)
110+
a6 <- as.numeric(a5)^2
111+
a <- a2+a3*a4*a6
112+
c <- 2*atan2(sqrt(a),sqrt(1-a))
113+
d <- R*c
114+
return (d)
115+
}
116+
117+
#Define the featureDataSource to be used to store the features, specify types of some variables as numeric
118+
featureDataSource = RxSqlServerData(table = "features",
119+
colClasses = c(pickup_longitude = "numeric", pickup_latitude = "numeric",
120+
dropoff_longitude = "numeric", dropoff_latitude = "numeric",
121+
passenger_count = "numeric", trip_distance = "numeric",
122+
trip_time_in_secs = "numeric", direct_distance = "numeric"),
123+
connectionString = connStr)
124+
125+
# Create feature (direct distance) by calling rxDataStep() function, which calls the env$ComputeDist function to process records
126+
# And output it along with other variables as features to the featureDataSource
127+
# This will be the feature set for training machine learning models
128+
start.time <- proc.time()
129+
rxDataStep(inData = inDataSource, outFile = featureDataSource, overwrite = TRUE,
130+
varsToKeep=c("tipped", "fare_amount", "passenger_count","trip_time_in_secs",
131+
"trip_distance", "pickup_datetime", "dropoff_datetime", "pickup_longitude",
132+
"pickup_latitude","dropoff_longitude", "dropoff_latitude"),
133+
transforms = list(direct_distance=ComputeDist(pickup_longitude, pickup_latitude, dropoff_longitude,
134+
dropoff_latitude)),
135+
transformEnvir = env, rowsPerRead=500, reportProgress = 3)
136+
used.time <- proc.time() - start.time
137+
print(paste("It takes CPU Time=", round(used.time[1]+used.time[2],2),
138+
" seconds, Elapsed Time=", round(used.time[3],2), " seconds to generate features.", sep=""))
139+
140+
# Alternatively, use a user defined function in SQL to create features
141+
# Sometimes, feature engineering in SQL might be faster than R
142+
# You need to choose the most efficient way based on real situation
143+
# Here, featureEngineeringQuery is just a reference to the result from a SQL query.
144+
featureEngineeringQuery = "SELECT tipped, fare_amount, passenger_count,trip_time_in_secs,trip_distance,
145+
pickup_datetime, dropoff_datetime,
146+
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) as direct_distance,
147+
pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude
148+
FROM nyctaxi_joined_1_percent
149+
tablesample (1 percent) repeatable (98052)
150+
"
151+
featureDataSource = RxSqlServerData(sqlQuery = featureEngineeringQuery,
152+
colClasses = c(pickup_longitude = "numeric", pickup_latitude = "numeric",
153+
dropoff_longitude = "numeric", dropoff_latitude = "numeric",
154+
passenger_count = "numeric", trip_distance = "numeric",
155+
trip_time_in_secs = "numeric", direct_distance = "numeric"),
156+
connectionString = connStr)
157+
158+
# summarize the feature table after the feature set is created
159+
rxGetVarInfo(data = featureDataSource)
160+
161+
################################
162+
# Training models #
163+
################################
164+
# build classification model to predict tipped or not
165+
system.time(logitObj <- rxLogit(tipped ~ passenger_count + trip_distance + trip_time_in_secs + direct_distance, data = featureDataSource))
166+
summary(logitObj)
167+
168+
################################
169+
# Make predictions #
170+
################################
171+
# predict and write the prediction results back to SQL Server table
172+
scoredOutput <- RxSqlServerData(
173+
connectionString = connStr,
174+
table = "taxiScoreOutput"
175+
)
176+
177+
rxPredict(modelObject = logitObj, data = featureDataSource, outData = scoredOutput,
178+
predVarNames = "Score", type = "response", writeModelVars = TRUE, overwrite = TRUE)
179+
180+
################################
181+
# Model evaluation #
182+
################################
183+
# plot ROC curve from SQL Context
184+
rxRocCurve( "tipped", "Score", scoredOutput)
185+
186+
# Plot accuracy vs threshold
187+
# We demonstrate how to do it on the client using Open source R library (ROCR)
188+
# NOTE: The non Revolution R Enterprise functions ("rx") run locally even if execution context is set to SQL Server
189+
# First of all you need to bring the scored Output data to the client using rxImport
190+
scoredOutput = rxImport(scoredOutput)
191+
192+
library('ROCR')
193+
pred <- prediction(scoredOutput$Score, scoredOutput$tipped)
194+
195+
acc.perf = performance(pred, measure = 'acc')
196+
plot(acc.perf)
197+
ind = which.max( slot(acc.perf, 'y.values')[[1]] )
198+
acc = slot(acc.perf, 'y.values')[[1]][ind]
199+
cutoff = slot(acc.perf, 'x.values')[[1]][ind]
200+
201+
################################
202+
# Model operationalization #
203+
################################
204+
# First, serialize a model and put it into a database table
205+
modelbin <- serialize(logitObj, NULL)
206+
modelbinstr=paste(modelbin, collapse="")
207+
208+
library(RODBC)
209+
conn <- odbcDriverConnect(connStr )
210+
211+
# Persist model by calling a stored procedure from SQL
212+
q<-paste("EXEC PersistModel @m='", modelbinstr,"'", sep="")
213+
sqlQuery (conn, q)
214+
215+
# We have already provided and installed two stored procs to call for prediction on this model - PredictTipBatchMode and PredictTipSingleMode
216+
# predict with stored procedure in batch mode. Take a few records that are not part of training data
217+
# NOTE: You need to generate the distance feature when you extract the records to send for prediction in batch mode
218+
# The following query selects the top 10 observations that are not in training set.
219+
# This query is parsed as an input parameter to a stored procedure PredictTipBatchMode to make predictions
220+
input = "N'select top 10 a.passenger_count as passenger_count,
221+
a.trip_time_in_secs as trip_time_in_secs,
222+
a.trip_distance as trip_distance,
223+
a.dropoff_datetime as dropoff_datetime,
224+
dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude,dropoff_longitude) as direct_distance
225+
from
226+
(
227+
select medallion, hack_license, pickup_datetime, passenger_count,trip_time_in_secs,trip_distance,
228+
dropoff_datetime, pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude
229+
from nyctaxi_joined_1_percent
230+
)a
231+
left outer join
232+
(
233+
select medallion, hack_license, pickup_datetime
234+
from nyctaxi_joined_1_percent
235+
tablesample (1 percent) repeatable (98052)
236+
)b
237+
on a.medallion=b.medallion and a.hack_license=b.hack_license and a.pickup_datetime=b.pickup_datetime
238+
where b.medallion is null
239+
'"
240+
q<-paste("EXEC PredictTipBatchMode @inquery = ", input, sep="")
241+
sqlQuery (conn, q)
242+
243+
# Call predict on a single observation
244+
q = "EXEC PredictTipSingleMode 1, 2.5, 631, 40.763958,-73.973373, 40.782139,-73.977303 "
245+
sqlQuery (conn, q)

0 commit comments

Comments
 (0)