From d705112e0ca32073680c36af99a724ecc94f5930 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Fri, 5 Jun 2015 01:47:14 -0700 Subject: [PATCH 01/15] Created more examples on SparkR DataFrames Here are more examples on SparkR DataFrames including creating a SQL context, loading data and simple data manipulation --- examples/src/main/r/0-getting-started.R | 23 ++++++++++ examples/src/main/r/1-data.R | 30 +++++++++++++ examples/src/main/r/2-data-manipulation.R | 51 +++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 examples/src/main/r/0-getting-started.R create mode 100644 examples/src/main/r/1-data.R create mode 100644 examples/src/main/r/2-data-manipulation.R diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R new file mode 100644 index 0000000000000..0c79678e675a9 --- /dev/null +++ b/examples/src/main/r/0-getting-started.R @@ -0,0 +1,23 @@ +# +# Author: Daniel Emaasit (@emaasit) +# Purpose: This script shows how to install SparkR onto your workstation/PC +# and initialize a spark context and a SparkSQL context +# Date: 06/05/2015 +# + + +# Install SparkR from CRAN +install.packages("SparkR") + +## OR Install the dev version from Github +install.packages(devtools) +devtools::install_github("amplab-extras/SparkR-pkg", subdir="pkg") + +# Load SparkR onto your PC +library(SparkR) + +## Initialize SparkContext on your local PC +sc <- sparkR.init(master = "local", appName = "MyApp") + +## Initialize SQLContext +sqlCtx <- SparkRSQL.init(sc) \ No newline at end of file diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R new file mode 100644 index 0000000000000..99e297b94f7db --- /dev/null +++ b/examples/src/main/r/1-data.R @@ -0,0 +1,30 @@ +# +# Author: Daniel Emaasit (@emaasit) +# Purpose: This script shows how to create Spark DataFrames +# Date: 06/05/2015 +# + +# For this example, we shall use the "flights" dataset +# The data can be downloaded from: 
https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv +# The dataset consists of every flight departing Houston in 2011. +# The data set is made up of 227,496 rows x 14 columns. + +source("0-getting-started.R") + +# Create an R data frame and then convert it to a SparkR DataFrame ------- + +## Create R dataframe +install.packages("data.table") #We want to use the fread() function to read the dataset +library(data.table) + +flights_df <- fread("flights.csv") +flights_df$date <- as.Date(flights_df$date) + +## Convert the local data frame into a SparkR DataFrame +flightsDF <- createDataFrame(sqlCtx, flights_df) + +## Print the schema of this Spark DataFrame +printSchema(flightsDF) + +## Cache the DataFrame +cache(flightsDF) \ No newline at end of file diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R new file mode 100644 index 0000000000000..ee9fa66d8829b --- /dev/null +++ b/examples/src/main/r/2-data-manipulation.R @@ -0,0 +1,51 @@ +# +# Author: Daniel Emaasit (@emaasit) +# Purpose: This script shows how to explore and manipulate Spark DataFrames +# Date: 06/05/2015 +# + +source("1-data.R") + + +# Install the magrittr pipeline operator +install.packages("magrittr") +library(magrittr) + +# Print the first 6 rows of the DataFrame +showDF(flightsDF, numRows = 6) ## Or +head(flightsDF) + +# Show the column names in the DataFrame +columns(flightsDF) + +# Show the number of rows in the DataFrame +count(flightsDF) + +# Show summary statistics for numeric colums +Describe(flightsDF) + +# Select specific columns +destDF <- select(flightsDF, "dest", "cancelled") + +# Using SQL to select columns of data +# First, register the flights DataFrame as a table +registerTempTable(flightsDF, "flightsTable") +destDF <- sql(sqlCtx, "SELECT dest, cancelled FROM flightsTable") + +# Use collect to create a local R data frame +dest_df <- collect(destDF) + +# Print the newly created local data frame +print(dest_df) + +# Filter flights 
whose destination is JFK +jfkDF <- filter(flightsDF, "dest == JFK") ##OR +jfkDF <- filter(flightsDF, flightsDF$dest == JFK) + +# Group the flights by date and then find the average daily delay +# Write the result into a DataFrame +groupBy(flightsDF, "date") %>% + avg(dep_delay = "avg", arr_delay = "avg") -> dailyDelayDF + +# Stop the SparkContext now +sparkR.stop() From 486f44ee2689e0d98bfc08a759339e6d6b80602d Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Fri, 5 Jun 2015 10:51:32 -0700 Subject: [PATCH 02/15] Added the Apache License at the top of the file --- examples/src/main/r/0-getting-started.R | 28 +++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R index 0c79678e675a9..d7a6d37a118f0 100644 --- a/examples/src/main/r/0-getting-started.R +++ b/examples/src/main/r/0-getting-started.R @@ -1,17 +1,19 @@ # -# Author: Daniel Emaasit (@emaasit) -# Purpose: This script shows how to install SparkR onto your workstation/PC -# and initialize a spark context and a SparkSQL context -# Date: 06/05/2015 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# - - -# Install SparkR from CRAN -install.packages("SparkR") - -## OR Install the dev version from Github -install.packages(devtools) -devtools::install_github("amplab-extras/SparkR-pkg", subdir="pkg") # Load SparkR onto your PC library(SparkR) @@ -20,4 +22,4 @@ library(SparkR) sc <- sparkR.init(master = "local", appName = "MyApp") ## Initialize SQLContext -sqlCtx <- SparkRSQL.init(sc) \ No newline at end of file +sqlCtx <- SparkRSQL.init(sc) From 2e8f72442b7e643bd9ac0f21db1f3b0200dc84e8 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Fri, 5 Jun 2015 10:53:05 -0700 Subject: [PATCH 03/15] Added the Apache License at the top of the file --- examples/src/main/r/1-data.R | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R index 99e297b94f7db..20cf6912fbda7 100644 --- a/examples/src/main/r/1-data.R +++ b/examples/src/main/r/1-data.R @@ -1,7 +1,18 @@ # -# Author: Daniel Emaasit (@emaasit) -# Purpose: This script shows how to create Spark DataFrames -# Date: 06/05/2015 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# # For this example, we shall use the "flights" dataset @@ -27,4 +38,4 @@ flightsDF <- createDataFrame(sqlCtx, flights_df) printSchema(flightsDF) ## Cache the DataFrame -cache(flightsDF) \ No newline at end of file +cache(flightsDF) From 275b787d8023dbf4771706f70bfe52bff5a078fc Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Fri, 5 Jun 2015 10:53:59 -0700 Subject: [PATCH 04/15] Added the Apache License at the top of the file --- examples/src/main/r/2-data-manipulation.R | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R index ee9fa66d8829b..f8c86e41c57e3 100644 --- a/examples/src/main/r/2-data-manipulation.R +++ b/examples/src/main/r/2-data-manipulation.R @@ -1,7 +1,18 @@ # -# Author: Daniel Emaasit (@emaasit) -# Purpose: This script shows how to explore and manipulate Spark DataFrames -# Date: 06/05/2015 +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# source("1-data.R") From 2653573ed51561ec708d1b795162467c78b1d1de Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 18:58:35 -0700 Subject: [PATCH 05/15] Updates to a comment and variable name Now using sqlContext as the variable name --- examples/src/main/r/0-getting-started.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R index d7a6d37a118f0..82b2c55f2832a 100644 --- a/examples/src/main/r/0-getting-started.R +++ b/examples/src/main/r/0-getting-started.R @@ -15,11 +15,11 @@ # limitations under the License. # -# Load SparkR onto your PC +# Load SparkR library into your R session library(SparkR) ## Initialize SparkContext on your local PC sc <- sparkR.init(master = "local", appName = "MyApp") ## Initialize SQLContext -sqlCtx <- SparkRSQL.init(sc) +sqlContext <- SparkRSQL.init(sc) From 8e0fe14677a0bffa0912861665db5b777740e1cb Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 19:31:42 -0700 Subject: [PATCH 06/15] provided two options for creating DataFrames provided two options for creating DataFrames. 
Option 1: from local data frames and option 2: directly create DataFrames using read.df function --- examples/src/main/r/1-data.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R index 20cf6912fbda7..5160ff045608b 100644 --- a/examples/src/main/r/1-data.R +++ b/examples/src/main/r/1-data.R @@ -22,7 +22,7 @@ source("0-getting-started.R") -# Create an R data frame and then convert it to a SparkR DataFrame ------- +# Option 1: Create an R data frame and then convert it to a SparkR DataFrame ------- ## Create R dataframe install.packages("data.table") #We want to use the fread() function to read the dataset @@ -32,10 +32,13 @@ flights_df <- fread("flights.csv") flights_df$date <- as.Date(flights_df$date) ## Convert the local data frame into a SparkR DataFrame -flightsDF <- createDataFrame(sqlCtx, flights_df) +flightsDF <- createDataFrame(sqlContext, flights_df) -## Print the schema of this Spark DataFrame +# Option 2: Alternatively, directly create a SparkR DataFrame from the source data +flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true") + +# Print the schema of this Spark DataFrame printSchema(flightsDF) -## Cache the DataFrame +# Cache the DataFrame cache(flightsDF) From c6933af25816114f7fd4098c65c222dbedb5763f Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 19:33:29 -0700 Subject: [PATCH 07/15] changed variable name to SQLContext --- examples/src/main/r/2-data-manipulation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R index f8c86e41c57e3..e0013a29bddb7 100644 --- a/examples/src/main/r/2-data-manipulation.R +++ b/examples/src/main/r/2-data-manipulation.R @@ -41,7 +41,7 @@ destDF <- select(flightsDF, "dest", "cancelled") # Using SQL to select columns of data # First, register the flights DataFrame as a table 
registerTempTable(flightsDF, "flightsTable") -destDF <- sql(sqlCtx, "SELECT dest, cancelled FROM flightsTable") +destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable") # Use collect to create a local R data frame dest_df <- collect(destDF) From cc55cd8ab3b26a418061a625f7500cd67328ba49 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 19:43:16 -0700 Subject: [PATCH 08/15] combined all the code into one .R file Deleted the source() function and combined all the code into one file --- examples/src/main/r/2-data-manipulation.R | 37 ++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R index e0013a29bddb7..5a96eb268208d 100644 --- a/examples/src/main/r/2-data-manipulation.R +++ b/examples/src/main/r/2-data-manipulation.R @@ -15,7 +15,42 @@ # limitations under the License. # -source("1-data.R") + +# Load SparkR library into your R session +library(SparkR) + +## Initialize SparkContext on your local PC +sc <- sparkR.init(master = "local", appName = "MyApp") + +## Initialize SQLContext +sqlContext <- SparkRSQL.init(sc) + +# For this example, we shall use the "flights" dataset +# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv +# The dataset consists of every flight departing Houston in 2011. +# The data set is made up of 227,496 rows x 14 columns. 
+ + +# Option 1: Create an R data frame and then convert it to a SparkR DataFrame ------- + +## Create R dataframe +install.packages("data.table") #We want to use the fread() function to read the dataset +library(data.table) + +flights_df <- fread("flights.csv") +flights_df$date <- as.Date(flights_df$date) + +## Convert the local data frame into a SparkR DataFrame +flightsDF <- createDataFrame(sqlContext, flights_df) + +# Option 2: Alternatively, directly create a SparkR DataFrame from the source data +flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true") + +# Print the schema of this Spark DataFrame +printSchema(flightsDF) + +# Cache the DataFrame +cache(flightsDF) # Install the magrittr pipeline operator From b95a103f95fca521d8e79a4fb65ba5dfd368e528 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 19:43:36 -0700 Subject: [PATCH 09/15] Deleted this file --- examples/src/main/r/1-data.R | 44 ------------------------------------ 1 file changed, 44 deletions(-) delete mode 100644 examples/src/main/r/1-data.R diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R deleted file mode 100644 index 5160ff045608b..0000000000000 --- a/examples/src/main/r/1-data.R +++ /dev/null @@ -1,44 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -# For this example, we shall use the "flights" dataset -# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv -# The dataset consists of every flight departing Houston in 2011. -# The data set is made up of 227,496 rows x 14 columns. - -source("0-getting-started.R") - -# Option 1: Create an R data frame and then convert it to a SparkR DataFrame ------- - -## Create R dataframe -install.packages("data.table") #We want to use the fread() function to read the dataset -library(data.table) - -flights_df <- fread("flights.csv") -flights_df$date <- as.Date(flights_df$date) - -## Convert the local data frame into a SparkR DataFrame -flightsDF <- createDataFrame(sqlContext, flights_df) - -# Option 2: Alternatively, directly create a SparkR DataFrame from the source data -flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true") - -# Print the schema of this Spark DataFrame -printSchema(flightsDF) - -# Cache the DataFrame -cache(flightsDF) From 90565dd817a28d8df9920b9b1eeadb92d730bfc5 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 19:44:44 -0700 Subject: [PATCH 10/15] Deleted the getting-started file Deleted the getting started file and combined all the code into one file --- examples/src/main/r/0-getting-started.R | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 examples/src/main/r/0-getting-started.R diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R deleted file mode 100644 index 82b2c55f2832a..0000000000000 --- a/examples/src/main/r/0-getting-started.R +++ /dev/null @@ -1,25 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Load SparkR library into your R session -library(SparkR) - -## Initialize SparkContext on your local PC -sc <- sparkR.init(master = "local", appName = "MyApp") - -## Initialize SQLContext -sqlContext <- SparkRSQL.init(sc) From b6603e341d7d315143d9c97116dd04f234623284 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 19:50:26 -0700 Subject: [PATCH 11/15] changed "Describe" function to "describe" --- examples/src/main/r/2-data-manipulation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R index 5a96eb268208d..887f3425e8412 100644 --- a/examples/src/main/r/2-data-manipulation.R +++ b/examples/src/main/r/2-data-manipulation.R @@ -68,7 +68,7 @@ columns(flightsDF) count(flightsDF) # Show summary statistics for numeric colums -Describe(flightsDF) +describe(flightsDF) # Select specific columns destDF <- select(flightsDF, "dest", "cancelled") From 33f988259f7b3a363c97f630f2d60c331626155f Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Sun, 7 Jun 2015 20:03:09 -0700 Subject: [PATCH 12/15] Renamed file Renamed file to data-manipulation.R --- .../src/main/r/{2-data-manipulation.R => data-manipulation.R} | 0 1 file changed, 0 insertions(+), 0 
deletions(-) rename examples/src/main/r/{2-data-manipulation.R => data-manipulation.R} (100%) diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/data-manipulation.R similarity index 100% rename from examples/src/main/r/2-data-manipulation.R rename to examples/src/main/r/data-manipulation.R From a550f70a8faf4556d8b592227254daccf20c9196 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Mon, 8 Jun 2015 11:31:52 -0700 Subject: [PATCH 13/15] Used base R functions Replaced the data.table function (fread) with base R function for reading csv files (read.csv) --- examples/src/main/r/data-manipulation.R | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R index 887f3425e8412..2822905b2b200 100644 --- a/examples/src/main/r/data-manipulation.R +++ b/examples/src/main/r/data-manipulation.R @@ -19,7 +19,7 @@ # Load SparkR library into your R session library(SparkR) -## Initialize SparkContext on your local PC +## Initialize SparkContext sc <- sparkR.init(master = "local", appName = "MyApp") ## Initialize SQLContext @@ -31,19 +31,16 @@ sqlContext <- SparkRSQL.init(sc) # The data set is made up of 227,496 rows x 14 columns. 
-# Option 1: Create an R data frame and then convert it to a SparkR DataFrame ------- +# Option 1: Create a local R data frame and then convert it to a SparkR DataFrame ------- -## Create R dataframe -install.packages("data.table") #We want to use the fread() function to read the dataset -library(data.table) - -flights_df <- fread("flights.csv") +## Create a local R dataframe +flights_df <- read.csv("flights.csv") flights_df$date <- as.Date(flights_df$date) ## Convert the local data frame into a SparkR DataFrame flightsDF <- createDataFrame(sqlContext, flights_df) -# Option 2: Alternatively, directly create a SparkR DataFrame from the source data +# Option 2: Alternatively, directly create a SparkR DataFrame from the source data ------- flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true") # Print the schema of this Spark DataFrame @@ -52,11 +49,6 @@ printSchema(flightsDF) # Cache the DataFrame cache(flightsDF) - -# Install the magrittr pipeline operator -install.packages("magrittr") -library(magrittr) - # Print the first 6 rows of the DataFrame showDF(flightsDF, numRows = 6) ## Or head(flightsDF) @@ -88,6 +80,9 @@ print(dest_df) jfkDF <- filter(flightsDF, "dest == JFK") ##OR jfkDF <- filter(flightsDF, flightsDF$dest == JFK) +# Install the magrittr library +library(magrittr) + # Group the flights by date and then find the average daily delay # Write the result into a DataFrame groupBy(flightsDF, "date") %>% From f7227f9935830197f9aa07b337884860c146bec3 Mon Sep 17 00:00:00 2001 From: "Daniel Emaasit (PhD Student)" Date: Tue, 9 Jun 2015 02:59:34 -0700 Subject: [PATCH 14/15] Using command line arguments Taking in data set as a command line argument --- examples/src/main/r/data-manipulation.R | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R index 2822905b2b200..201bf4493208a 100644 --- 
a/examples/src/main/r/data-manipulation.R +++ b/examples/src/main/r/data-manipulation.R @@ -20,28 +20,37 @@ library(SparkR) ## Initialize SparkContext -sc <- sparkR.init(master = "local", appName = "MyApp") +sc <- sparkR.init(appName = "SparkR-data-manipulation-example") ## Initialize SQLContext sqlContext <- SparkRSQL.init(sc) # For this example, we shall use the "flights" dataset -# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv # The dataset consists of every flight departing Houston in 2011. # The data set is made up of 227,496 rows x 14 columns. -# Option 1: Create a local R data frame and then convert it to a SparkR DataFrame ------- +args <- commandArgs(trailing = TRUE) +if (length(args) != 1) { + print("Usage: data-manipulation.R Date: Wed, 10 Jun 2015 04:08:51 -0700 Subject: [PATCH 15/15] Used fewer rows for createDataFrame To create a SparkR DataFrame, I used fewer rows of the local data frame. --- examples/src/main/r/data-manipulation.R | 34 ++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R index 201bf4493208a..dcf8fd588e5eb 100644 --- a/examples/src/main/r/data-manipulation.R +++ b/examples/src/main/r/data-manipulation.R @@ -15,22 +15,21 @@ # limitations under the License. # +# For this example, we shall use the "flights" dataset +# The dataset consists of every flight departing Houston in 2011. +# The data set is made up of 227,496 rows x 14 columns. # Load SparkR library into your R session library(SparkR) +args <- commandArgs(trailing = TRUE) + ## Initialize SparkContext sc <- sparkR.init(appName = "SparkR-data-manipulation-example") ## Initialize SQLContext -sqlContext <- SparkRSQL.init(sc) +sqlContext <- sparkRSQL.init(sc) -# For this example, we shall use the "flights" dataset -# The dataset consists of every flight departing Houston in 2011. 
-# The data set is made up of 227,496 rows x 14 columns. - - -args <- commandArgs(trailing = TRUE) if (length(args) != 1) { print("Usage: data-manipulation.R