1+ # PreRequisites: You have installed Revolution R Enterprise 7.5.0 or higher on the machine and SQL Server 2016 CTP3 or higher on the database server
2+ # Install required R libraries for this walkthrough if they are not installed.
3+
4+ if (! (' ggmap' %in% rownames(installed.packages()))){
5+ install.packages(' ggmap' )
6+ }
7+ if (! (' mapproj' %in% rownames(installed.packages()))){
8+ install.packages(' mapproj' )
9+ }
10+ if (! (' ROCR' %in% rownames(installed.packages()))){
11+ install.packages(' ROCR' )
12+ }
13+ if (! (' RODBC' %in% rownames(installed.packages()))){
14+ install.packages(' RODBC' )
15+ }
16+
117library(RevoScaleR )
2- # connection string
3- # currently we use SQL authentication
4- connStr <- " Driver=SQL Server;Server=<your_server_name.somedomain.com>;Database=<Your_Database_Name>;Uid=<Your_User_Name>;Pwd=<Your_Password>"
518
6- # set ComputeContext
19+ # Define the connection string
20+ # This walkthrough requires SQL authentication
21+ connStr <- " Driver=SQL Server;Server=<Your_Server_Name.somedomain.com>;Database=<Your_Database_Name>;Uid=<Your_User_Name>;Pwd=<Your_Password>"
22+
23+ # Set ComputeContext. Needs a temp directory path to serialize R objects back and forth
724sqlShareDir <- paste(" C:\\ AllShare\\ " ,Sys.getenv(" USERNAME" ),sep = " " )
825sqlWait <- TRUE
926sqlConsoleOutput <- FALSE
1027cc <- RxInSqlServer(connectionString = connStr , shareDir = sqlShareDir ,
1128 wait = sqlWait , consoleOutput = sqlConsoleOutput )
12- rxSetComputeContext(cc )
29+ rxSetComputeContext(cc )
30+
31+ # Define a DataSource (from a select query) to be used to explore the data and generate features from.
32+ # Keep in mind that inDataSource is just a reference to the result dataset from the SQL query.
33+ sampleDataQuery <- " select top 1000 tipped, fare_amount, passenger_count,trip_time_in_secs,trip_distance,
34+ pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude, dropoff_longitude,
35+ dropoff_latitude from nyctaxi_joined_1_percent"
36+
37+
38+ inDataSource <- RxSqlServerData(sqlQuery = sampleDataQuery , connectionString = connStr ,
39+ colClasses = c(pickup_longitude = " numeric" , pickup_latitude = " numeric" ,
40+ dropoff_longitude = " numeric" , dropoff_latitude = " numeric" ),
41+ rowsPerRead = 500 )
42+
43+ # ###############################
44+ # Data exploration #
45+ # ###############################
46+ # Summarize the inDataSource
47+ rxGetVarInfo(data = inDataSource )
48+ start.time <- proc.time()
49+ rxSummary(~ fare_amount : F (passenger_count ), data = inDataSource )
50+ used.time <- proc.time() - start.time
51+ print(paste(" It takes CPU Time=" , round(used.time [1 ]+ used.time [2 ],2 )," seconds, Elapsed Time=" ,
52+ round(used.time [3 ],2 ), " seconds to summarize the inDataSource." , sep = " " ))
53+
54+ # ###############################
55+ # Data Visualization #
56+ # ###############################
57+ options(jupyter.plot_mimetypes = ' image/png' )
58+
59+ # Plot fare amount histogram on the SQL Server, and ship the plot to R client to display
60+ start.time <- proc.time()
61+ rxHistogram(~ fare_amount , data = inDataSource , title = " Fare Amount Histogram" )
62+ used.time <- proc.time() - start.time
63+ print(paste(" It takes CPU Time=" , round(used.time [1 ]+ used.time [2 ],2 ),
64+ " seconds, Elapsed Time=" , round(used.time [3 ],2 ), " seconds to generate histogram." , sep = " " ))
65+
66+ # Plot pickup location on map in SQL Server
67+ # Define a function that plots points on a map
68+ mapPlot <- function (inDataSource , googMap ){
69+ library(ggmap )
70+ library(mapproj )
71+
72+ # Open Source R functions require data to be brought back in memory into data frames. Use rxImport to bring in data.
73+ # Remember: This whole function runs in the SQL Server Context.
74+ ds <- rxImport(inDataSource )
75+
76+ p <- ggmap(googMap )+
77+ geom_point(aes(x = pickup_longitude , y = pickup_latitude ),
78+ data = ds , alpha = .5 , color = " darkred" , size = 1.5 )
79+
80+ return (list (myplot = p ))
81+ }
82+
83+ library(ggmap )
84+ library(mapproj )
85+ # Get the map with Times Square, NY as the center. This is run on the R Client
86+ gc <- geocode(" Times Square" , source = " google" )
87+ googMap <- get_googlemap(center = as.numeric(gc ), zoom = 12 , maptype = ' roadmap' , color = ' color' )
88+ # Run the points plotting on SQL server. Passing in the map data as arg to remotely executed function.
89+ # The points are in the database and will be plotted on the map
90+ myplots <- rxExec(mapPlot , inDataSource , googMap , timesToRun = 1 )
91+ plot(myplots [[1 ]][[" myplot" ]])
92+
93+ # ###############################
94+ # Feature engineering #
95+ # ###############################
96+ # Define a function in open source R to calculate the direct distance between pickup and dropoff as a new feature
97+ # Use Haversine Formula: https://en.wikipedia.org/wiki/Haversine_formula
98+ env <- new.env()
99+
100+ env $ ComputeDist <- function (pickup_long , pickup_lat , dropoff_long , dropoff_lat ){
101+ R <- 6371 / 1.609344 # radius in mile
102+ delta_lat <- dropoff_lat - pickup_lat
103+ delta_long <- dropoff_long - pickup_long
104+ degrees_to_radians = pi / 180.0
105+ a1 <- sin(delta_lat / 2 * degrees_to_radians )
106+ a2 <- as.numeric(a1 )^ 2
107+ a3 <- cos(pickup_lat * degrees_to_radians )
108+ a4 <- cos(dropoff_lat * degrees_to_radians )
109+ a5 <- sin(delta_long / 2 * degrees_to_radians )
110+ a6 <- as.numeric(a5 )^ 2
111+ a <- a2 + a3 * a4 * a6
112+ c <- 2 * atan2(sqrt(a ),sqrt(1 - a ))
113+ d <- R * c
114+ return (d )
115+ }
116+
117+ # Define the featureDataSource to be used to store the features, specify types of some variables as numeric
118+ featureDataSource = RxSqlServerData(table = " features" ,
119+ colClasses = c(pickup_longitude = " numeric" , pickup_latitude = " numeric" ,
120+ dropoff_longitude = " numeric" , dropoff_latitude = " numeric" ,
121+ passenger_count = " numeric" , trip_distance = " numeric" ,
122+ trip_time_in_secs = " numeric" , direct_distance = " numeric" ),
123+ connectionString = connStr )
124+
125+ # Create feature (direct distance) by calling rxDataStep() function, which calls the env$ComputeDist function to process records
126+ # And output it along with other variables as features to the featureDataSource
127+ # This will be the feature set for training machine learning models
128+ start.time <- proc.time()
129+ rxDataStep(inData = inDataSource , outFile = featureDataSource , overwrite = TRUE ,
130+ varsToKeep = c(" tipped" , " fare_amount" , " passenger_count" ," trip_time_in_secs" ,
131+ " trip_distance" , " pickup_datetime" , " dropoff_datetime" , " pickup_longitude" ,
132+ " pickup_latitude" ," dropoff_longitude" , " dropoff_latitude" ),
133+ transforms = list (direct_distance = ComputeDist(pickup_longitude , pickup_latitude , dropoff_longitude ,
134+ dropoff_latitude )),
135+ transformEnvir = env , rowsPerRead = 500 , reportProgress = 3 )
136+ used.time <- proc.time() - start.time
137+ print(paste(" It takes CPU Time=" , round(used.time [1 ]+ used.time [2 ],2 ),
138+ " seconds, Elapsed Time=" , round(used.time [3 ],2 ), " seconds to generate features." , sep = " " ))
139+
140+ # Alternatively, use a user defined function in SQL to create features
141+ # Sometimes, feature engineering in SQL might be faster than R
142+ # You need to choose the most efficient way based on real situation
143+ # Here, featureEngineeringQuery is just a reference to the result from a SQL query.
144+ featureEngineeringQuery = " SELECT tipped, fare_amount, passenger_count,trip_time_in_secs,trip_distance,
145+ pickup_datetime, dropoff_datetime,
146+ dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude) as direct_distance,
147+ pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude
148+ FROM nyctaxi_joined_1_percent
149+ tablesample (1 percent) repeatable (98052)
150+ "
151+ featureDataSource = RxSqlServerData(sqlQuery = featureEngineeringQuery ,
152+ colClasses = c(pickup_longitude = " numeric" , pickup_latitude = " numeric" ,
153+ dropoff_longitude = " numeric" , dropoff_latitude = " numeric" ,
154+ passenger_count = " numeric" , trip_distance = " numeric" ,
155+ trip_time_in_secs = " numeric" , direct_distance = " numeric" ),
156+ connectionString = connStr )
157+
158+ # summarize the feature table after the feature set is created
159+ rxGetVarInfo(data = featureDataSource )
160+
161+ # ###############################
162+ # Training models #
163+ # ###############################
164+ # build classification model to predict tipped or not
165+ system.time(logitObj <- rxLogit(tipped ~ passenger_count + trip_distance + trip_time_in_secs + direct_distance , data = featureDataSource ))
166+ summary(logitObj )
167+
168+ # ###############################
169+ # Make predictions #
170+ # ###############################
171+ # predict and write the prediction results back to SQL Server table
172+ scoredOutput <- RxSqlServerData(
173+ connectionString = connStr ,
174+ table = " taxiScoreOutput"
175+ )
176+
177+ rxPredict(modelObject = logitObj , data = featureDataSource , outData = scoredOutput ,
178+ predVarNames = " Score" , type = " response" , writeModelVars = TRUE , overwrite = TRUE )
179+
180+ # ###############################
181+ # Model evaluation #
182+ # ###############################
183+ # plot ROC curve from SQL Context
184+ rxRocCurve( " tipped" , " Score" , scoredOutput )
185+
186+ # Plot accuracy vs threshold
187+ # We demonstrate how to do it on the client using Open source R library (ROCR)
188+ # NOTE: The non Revolution R Enterprise functions ("rx") run locally even if execution context is set to SQL Server
189+ # First of all you need to bring the scored Output data to the client using rxImport
190+ scoredOutput = rxImport(scoredOutput )
191+
192+ library(' ROCR' )
193+ pred <- prediction(scoredOutput $ Score , scoredOutput $ tipped )
194+
195+ acc.perf = performance(pred , measure = ' acc' )
196+ plot(acc.perf )
197+ ind = which.max( slot(acc.perf , ' y.values' )[[1 ]] )
198+ acc = slot(acc.perf , ' y.values' )[[1 ]][ind ]
199+ cutoff = slot(acc.perf , ' x.values' )[[1 ]][ind ]
200+
201+ # ###############################
202+ # Model operationalization #
203+ # ###############################
204+ # First, serialize a model and put it into a database table
205+ modelbin <- serialize(logitObj , NULL )
206+ modelbinstr = paste(modelbin , collapse = " " )
207+
208+ library(RODBC )
209+ conn <- odbcDriverConnect(connStr )
210+
211+ # Persist model by calling a stored procedure from SQL
212+ q <- paste(" EXEC PersistModel @m='" , modelbinstr ," '" , sep = " " )
213+ sqlQuery (conn , q )
214+
215+ # We have already provided and installed two stored procs to call for prediction on this model - PredictTipBatchMode and PredictTipSingleMode
216+ # predict with stored procedure in batch mode. Take a few records that are not part of training data
217+ # NOTE: You need to generate the distance feature when you extract the records to send for prediction in batch mode
218+ # The following query selects the top 10 observations that are not in training set.
219+ # This query is parsed as an input parameter to a stored procedure PredictTipBatchMode to make predictions
220+ input = " N'select top 10 a.passenger_count as passenger_count,
221+ a.trip_time_in_secs as trip_time_in_secs,
222+ a.trip_distance as trip_distance,
223+ a.dropoff_datetime as dropoff_datetime,
224+ dbo.fnCalculateDistance(pickup_latitude, pickup_longitude, dropoff_latitude,dropoff_longitude) as direct_distance
225+ from
226+ (
227+ select medallion, hack_license, pickup_datetime, passenger_count,trip_time_in_secs,trip_distance,
228+ dropoff_datetime, pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude
229+ from nyctaxi_joined_1_percent
230+ )a
231+ left outer join
232+ (
233+ select medallion, hack_license, pickup_datetime
234+ from nyctaxi_joined_1_percent
235+ tablesample (1 percent) repeatable (98052)
236+ )b
237+ on a.medallion=b.medallion and a.hack_license=b.hack_license and a.pickup_datetime=b.pickup_datetime
238+ where b.medallion is null
239+ '"
240+ q <- paste(" EXEC PredictTipBatchMode @inquery = " , input , sep = " " )
241+ sqlQuery (conn , q )
242+
243+ # Call predict on a single observation
244+ q = " EXEC PredictTipSingleMode 1, 2.5, 631, 40.763958,-73.973373, 40.782139,-73.977303 "
245+ sqlQuery (conn , q )
0 commit comments