From d705112e0ca32073680c36af99a724ecc94f5930 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Fri, 5 Jun 2015 01:47:14 -0700
Subject: [PATCH 01/15] Created more examples on SparkR DataFrames

Here are more examples on SparkR DataFrames including creating a SQL
context, loading data and simple data manipulation
---
 examples/src/main/r/0-getting-started.R   | 23 ++++++++++
 examples/src/main/r/1-data.R              | 30 +++++++++++++
 examples/src/main/r/2-data-manipulation.R | 51 +++++++++++++++++++++++
 3 files changed, 104 insertions(+)
 create mode 100644 examples/src/main/r/0-getting-started.R
 create mode 100644 examples/src/main/r/1-data.R
 create mode 100644 examples/src/main/r/2-data-manipulation.R

diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R
new file mode 100644
index 0000000000000..0c79678e675a9
--- /dev/null
+++ b/examples/src/main/r/0-getting-started.R
@@ -0,0 +1,23 @@
+#
+# Author:   Daniel Emaasit (@emaasit)
+# Purpose: This script shows how to install SparkR onto your workstation/PC
+#          and initialize a spark context and a SparkSQL context
+# Date:    06/05/2015
+#
+
+
+# Install SparkR from CRAN
+install.packages("SparkR")
+
+## OR Install the dev version from Github
+install.packages(devtools)
+devtools::install_github("amplab-extras/SparkR-pkg", subdir="pkg")
+
+# Load SparkR onto your PC
+library(SparkR)
+
+## Initialize SparkContext on your local PC
+sc <- sparkR.init(master = "local", appName = "MyApp")
+
+## Initialize SQLContext
+sqlCtx <- SparkRSQL.init(sc)
\ No newline at end of file
diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R
new file mode 100644
index 0000000000000..99e297b94f7db
--- /dev/null
+++ b/examples/src/main/r/1-data.R
@@ -0,0 +1,30 @@
+#
+# Author:   Daniel Emaasit (@emaasit)
+# Purpose: This script shows how to create Spark DataFrames 
+# Date:    06/05/2015
+#
+
+# For this example, we shall use the "flights" dataset
+# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv 
+# The dataset consists of every flight departing Houston in 2011.
+# The data set is made up of 227,496 rows x 14 columns. 
+
+source("0-getting-started.R")
+
+# Create an R data frame and then convert it to a SparkR DataFrame -------
+
+## Create R dataframe
+install.packages("data.table") #We want to use the fread() function to read the dataset
+library(data.table)
+
+flights_df <- fread("flights.csv")
+flights_df$date <- as.Date(flights_df$date)
+
+## Convert the local data frame into a SparkR DataFrame
+flightsDF <- createDataFrame(sqlCtx, flights_df)
+
+## Print the schema of this Spark DataFrame
+printSchema(flightsDF)
+
+## Cache the DataFrame
+cache(flightsDF)
\ No newline at end of file
diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R
new file mode 100644
index 0000000000000..ee9fa66d8829b
--- /dev/null
+++ b/examples/src/main/r/2-data-manipulation.R
@@ -0,0 +1,51 @@
+#
+# Author:   Daniel Emaasit (@emaasit)
+# Purpose: This script shows how to explore and manipulate Spark DataFrames 
+# Date:    06/05/2015
+#
+
+source("1-data.R")
+
+
+# Install the magrittr pipeline operator
+install.packages("magrittr")
+library(magrittr)
+
+# Print the first 6 rows of the DataFrame
+showDF(flightsDF, numRows = 6) ## Or
+head(flightsDF)
+
+# Show the column names in the DataFrame
+columns(flightsDF)
+
+# Show the number of rows in the DataFrame
+count(flightsDF)
+
+# Show summary statistics for numeric colums
+Describe(flightsDF)
+
+# Select specific columns
+destDF <- select(flightsDF, "dest", "cancelled")
+
+# Using SQL to select columns of data
+# First, register the flights DataFrame as a table
+registerTempTable(flightsDF, "flightsTable")
+destDF <- sql(sqlCtx, "SELECT dest, cancelled FROM flightsTable")
+
+# Use collect to create a local R data frame
+dest_df <- collect(destDF)
+
+# Print the newly created local data frame
+print(dest_df)
+
+# Filter flights whose destination is JFK
+jfkDF <- filter(flightsDF, "dest == JFK") ##OR
+jfkDF <- filter(flightsDF, flightsDF$dest == JFK)
+
+# Group the flights by date and then find the average daily delay
+# Write the result into a DataFrame
+groupBy(flightsDF, "date") %>%
+  avg(dep_delay = "avg", arr_delay = "avg") -> dailyDelayDF
+
+# Stop the SparkContext now
+sparkR.stop()

From 486f44ee2689e0d98bfc08a759339e6d6b80602d Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Fri, 5 Jun 2015 10:51:32 -0700
Subject: [PATCH 02/15] Added the Apache License at the top of the file

---
 examples/src/main/r/0-getting-started.R | 28 +++++++++++++------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R
index 0c79678e675a9..d7a6d37a118f0 100644
--- a/examples/src/main/r/0-getting-started.R
+++ b/examples/src/main/r/0-getting-started.R
@@ -1,17 +1,19 @@
 #
-# Author:   Daniel Emaasit (@emaasit)
-# Purpose: This script shows how to install SparkR onto your workstation/PC
-#          and initialize a spark context and a SparkSQL context
-# Date:    06/05/2015
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
-
-
-# Install SparkR from CRAN
-install.packages("SparkR")
-
-## OR Install the dev version from Github
-install.packages(devtools)
-devtools::install_github("amplab-extras/SparkR-pkg", subdir="pkg")
 
 # Load SparkR onto your PC
 library(SparkR)
@@ -20,4 +22,4 @@ library(SparkR)
 sc <- sparkR.init(master = "local", appName = "MyApp")
 
 ## Initialize SQLContext
-sqlCtx <- SparkRSQL.init(sc)
\ No newline at end of file
+sqlCtx <- SparkRSQL.init(sc)

From 2e8f72442b7e643bd9ac0f21db1f3b0200dc84e8 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Fri, 5 Jun 2015 10:53:05 -0700
Subject: [PATCH 03/15] Added the Apache License at the top of the file

---
 examples/src/main/r/1-data.R | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R
index 99e297b94f7db..20cf6912fbda7 100644
--- a/examples/src/main/r/1-data.R
+++ b/examples/src/main/r/1-data.R
@@ -1,7 +1,18 @@
 #
-# Author:   Daniel Emaasit (@emaasit)
-# Purpose: This script shows how to create Spark DataFrames 
-# Date:    06/05/2015
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
 
 # For this example, we shall use the "flights" dataset
@@ -27,4 +38,4 @@ flightsDF <- createDataFrame(sqlCtx, flights_df)
 printSchema(flightsDF)
 
 ## Cache the DataFrame
-cache(flightsDF)
\ No newline at end of file
+cache(flightsDF)

From 275b787d8023dbf4771706f70bfe52bff5a078fc Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Fri, 5 Jun 2015 10:53:59 -0700
Subject: [PATCH 04/15] Added the Apache License at the top of the file

---
 examples/src/main/r/2-data-manipulation.R | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R
index ee9fa66d8829b..f8c86e41c57e3 100644
--- a/examples/src/main/r/2-data-manipulation.R
+++ b/examples/src/main/r/2-data-manipulation.R
@@ -1,7 +1,18 @@
 #
-# Author:   Daniel Emaasit (@emaasit)
-# Purpose: This script shows how to explore and manipulate Spark DataFrames 
-# Date:    06/05/2015
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 #
 
 source("1-data.R")

From 2653573ed51561ec708d1b795162467c78b1d1de Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 18:58:35 -0700
Subject: [PATCH 05/15] Updates to a comment and variable name

Now using sqlContext as the variable name
---
 examples/src/main/r/0-getting-started.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R
index d7a6d37a118f0..82b2c55f2832a 100644
--- a/examples/src/main/r/0-getting-started.R
+++ b/examples/src/main/r/0-getting-started.R
@@ -15,11 +15,11 @@
 # limitations under the License.
 #
 
-# Load SparkR onto your PC
+# Load SparkR library into your R session
 library(SparkR)
 
 ## Initialize SparkContext on your local PC
 sc <- sparkR.init(master = "local", appName = "MyApp")
 
 ## Initialize SQLContext
-sqlCtx <- SparkRSQL.init(sc)
+sqlContext <- SparkRSQL.init(sc)

From 8e0fe14677a0bffa0912861665db5b777740e1cb Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 19:31:42 -0700
Subject: [PATCH 06/15] provided two options for creating DataFrames

Provided two options for creating DataFrames. Option 1: convert a local R data frame into a DataFrame; option 2: create the DataFrame directly using the read.df function.
---
 examples/src/main/r/1-data.R | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R
index 20cf6912fbda7..5160ff045608b 100644
--- a/examples/src/main/r/1-data.R
+++ b/examples/src/main/r/1-data.R
@@ -22,7 +22,7 @@
 
 source("0-getting-started.R")
 
-# Create an R data frame and then convert it to a SparkR DataFrame -------
+# Option 1: Create an R data frame and then convert it to a SparkR DataFrame -------
 
 ## Create R dataframe
 install.packages("data.table") #We want to use the fread() function to read the dataset
@@ -32,10 +32,13 @@ flights_df <- fread("flights.csv")
 flights_df$date <- as.Date(flights_df$date)
 
 ## Convert the local data frame into a SparkR DataFrame
-flightsDF <- createDataFrame(sqlCtx, flights_df)
+flightsDF <- createDataFrame(sqlContext, flights_df)
 
-## Print the schema of this Spark DataFrame
+# Option 2: Alternatively, directly create a SparkR DataFrame from the source data
+flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")
+
+# Print the schema of this Spark DataFrame
 printSchema(flightsDF)
 
-## Cache the DataFrame
+# Cache the DataFrame
 cache(flightsDF)

From c6933af25816114f7fd4098c65c222dbedb5763f Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 19:33:29 -0700
Subject: [PATCH 07/15] changed variable name to sqlContext

---
 examples/src/main/r/2-data-manipulation.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R
index f8c86e41c57e3..e0013a29bddb7 100644
--- a/examples/src/main/r/2-data-manipulation.R
+++ b/examples/src/main/r/2-data-manipulation.R
@@ -41,7 +41,7 @@ destDF <- select(flightsDF, "dest", "cancelled")
 # Using SQL to select columns of data
 # First, register the flights DataFrame as a table
 registerTempTable(flightsDF, "flightsTable")
-destDF <- sql(sqlCtx, "SELECT dest, cancelled FROM flightsTable")
+destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
 
 # Use collect to create a local R data frame
 dest_df <- collect(destDF)

From cc55cd8ab3b26a418061a625f7500cd67328ba49 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 19:43:16 -0700
Subject: [PATCH 08/15] combined all the code into one .R file

Deleted the source() function and combined all the code into one file
---
 examples/src/main/r/2-data-manipulation.R | 37 ++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R
index e0013a29bddb7..5a96eb268208d 100644
--- a/examples/src/main/r/2-data-manipulation.R
+++ b/examples/src/main/r/2-data-manipulation.R
@@ -15,7 +15,42 @@
 # limitations under the License.
 #
 
-source("1-data.R")
+
+# Load SparkR library into your R session
+library(SparkR)
+
+## Initialize SparkContext on your local PC
+sc <- sparkR.init(master = "local", appName = "MyApp")
+
+## Initialize SQLContext
+sqlContext <- SparkRSQL.init(sc)
+
+# For this example, we shall use the "flights" dataset
+# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv 
+# The dataset consists of every flight departing Houston in 2011.
+# The data set is made up of 227,496 rows x 14 columns. 
+
+
+# Option 1: Create an R data frame and then convert it to a SparkR DataFrame -------
+
+## Create R dataframe
+install.packages("data.table") #We want to use the fread() function to read the dataset
+library(data.table)
+
+flights_df <- fread("flights.csv")
+flights_df$date <- as.Date(flights_df$date)
+
+## Convert the local data frame into a SparkR DataFrame
+flightsDF <- createDataFrame(sqlContext, flights_df)
+
+# Option 2: Alternatively, directly create a SparkR DataFrame from the source data
+flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")
+
+# Print the schema of this Spark DataFrame
+printSchema(flightsDF)
+
+# Cache the DataFrame
+cache(flightsDF)
 
 
 # Install the magrittr pipeline operator

From b95a103f95fca521d8e79a4fb65ba5dfd368e528 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 19:43:36 -0700
Subject: [PATCH 09/15] Deleted this file

---
 examples/src/main/r/1-data.R | 44 ------------------------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 examples/src/main/r/1-data.R

diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R
deleted file mode 100644
index 5160ff045608b..0000000000000
--- a/examples/src/main/r/1-data.R
+++ /dev/null
@@ -1,44 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# For this example, we shall use the "flights" dataset
-# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv 
-# The dataset consists of every flight departing Houston in 2011.
-# The data set is made up of 227,496 rows x 14 columns. 
-
-source("0-getting-started.R")
-
-# Option 1: Create an R data frame and then convert it to a SparkR DataFrame -------
-
-## Create R dataframe
-install.packages("data.table") #We want to use the fread() function to read the dataset
-library(data.table)
-
-flights_df <- fread("flights.csv")
-flights_df$date <- as.Date(flights_df$date)
-
-## Convert the local data frame into a SparkR DataFrame
-flightsDF <- createDataFrame(sqlContext, flights_df)
-
-# Option 2: Alternatively, directly create a SparkR DataFrame from the source data
-flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")
-
-# Print the schema of this Spark DataFrame
-printSchema(flightsDF)
-
-# Cache the DataFrame
-cache(flightsDF)

From 90565dd817a28d8df9920b9b1eeadb92d730bfc5 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 19:44:44 -0700
Subject: [PATCH 10/15] Deleted the getting-started file

Deleted the getting started file and combined all the code into one file
---
 examples/src/main/r/0-getting-started.R | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 examples/src/main/r/0-getting-started.R

diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R
deleted file mode 100644
index 82b2c55f2832a..0000000000000
--- a/examples/src/main/r/0-getting-started.R
+++ /dev/null
@@ -1,25 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Load SparkR library into your R session
-library(SparkR)
-
-## Initialize SparkContext on your local PC
-sc <- sparkR.init(master = "local", appName = "MyApp")
-
-## Initialize SQLContext
-sqlContext <- SparkRSQL.init(sc)

From b6603e341d7d315143d9c97116dd04f234623284 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 19:50:26 -0700
Subject: [PATCH 11/15] changed "Describe" function to "describe"

---
 examples/src/main/r/2-data-manipulation.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R
index 5a96eb268208d..887f3425e8412 100644
--- a/examples/src/main/r/2-data-manipulation.R
+++ b/examples/src/main/r/2-data-manipulation.R
@@ -68,7 +68,7 @@ columns(flightsDF)
 count(flightsDF)
 
 # Show summary statistics for numeric colums
-Describe(flightsDF)
+describe(flightsDF)
 
 # Select specific columns
 destDF <- select(flightsDF, "dest", "cancelled")

From 33f988259f7b3a363c97f630f2d60c331626155f Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Sun, 7 Jun 2015 20:03:09 -0700
Subject: [PATCH 12/15] Renamed file

Renamed file to data-manipulation.R
---
 .../src/main/r/{2-data-manipulation.R => data-manipulation.R}     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename examples/src/main/r/{2-data-manipulation.R => data-manipulation.R} (100%)

diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/data-manipulation.R
similarity index 100%
rename from examples/src/main/r/2-data-manipulation.R
rename to examples/src/main/r/data-manipulation.R

From a550f70a8faf4556d8b592227254daccf20c9196 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Mon, 8 Jun 2015 11:31:52 -0700
Subject: [PATCH 13/15] Used base R functions

Replaced the data.table function (fread) with base R function for reading csv files (read.csv)
---
 examples/src/main/r/data-manipulation.R | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index 887f3425e8412..2822905b2b200 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -19,7 +19,7 @@
 # Load SparkR library into your R session
 library(SparkR)
 
-## Initialize SparkContext on your local PC
+## Initialize SparkContext
 sc <- sparkR.init(master = "local", appName = "MyApp")
 
 ## Initialize SQLContext
@@ -31,19 +31,16 @@ sqlContext <- SparkRSQL.init(sc)
 # The data set is made up of 227,496 rows x 14 columns. 
 
 
-# Option 1: Create an R data frame and then convert it to a SparkR DataFrame -------
+# Option 1: Create a local R data frame and then convert it to a SparkR DataFrame -------
 
-## Create R dataframe
-install.packages("data.table") #We want to use the fread() function to read the dataset
-library(data.table)
-
-flights_df <- fread("flights.csv")
+## Create a local R dataframe
+flights_df <- read.csv("flights.csv")
 flights_df$date <- as.Date(flights_df$date)
 
 ## Convert the local data frame into a SparkR DataFrame
 flightsDF <- createDataFrame(sqlContext, flights_df)
 
-# Option 2: Alternatively, directly create a SparkR DataFrame from the source data
+# Option 2: Alternatively, directly create a SparkR DataFrame from the source data -------
 flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")
 
 # Print the schema of this Spark DataFrame
@@ -52,11 +49,6 @@ printSchema(flightsDF)
 # Cache the DataFrame
 cache(flightsDF)
 
-
-# Install the magrittr pipeline operator
-install.packages("magrittr")
-library(magrittr)
-
 # Print the first 6 rows of the DataFrame
 showDF(flightsDF, numRows = 6) ## Or
 head(flightsDF)
@@ -88,6 +80,9 @@ print(dest_df)
 jfkDF <- filter(flightsDF, "dest == JFK") ##OR
 jfkDF <- filter(flightsDF, flightsDF$dest == JFK)
 
+# Install the magrittr library
+library(magrittr)
+
 # Group the flights by date and then find the average daily delay
 # Write the result into a DataFrame
 groupBy(flightsDF, "date") %>%

From f7227f9935830197f9aa07b337884860c146bec3 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Tue, 9 Jun 2015 02:59:34 -0700
Subject: [PATCH 14/15] Using command line arguments

Taking in data set as a command line argument
---
 examples/src/main/r/data-manipulation.R | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index 2822905b2b200..201bf4493208a 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -20,28 +20,37 @@
 library(SparkR)
 
 ## Initialize SparkContext
-sc <- sparkR.init(master = "local", appName = "MyApp")
+sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
 
 ## Initialize SQLContext
 sqlContext <- SparkRSQL.init(sc)
 
 # For this example, we shall use the "flights" dataset
-# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv 
 # The dataset consists of every flight departing Houston in 2011.
 # The data set is made up of 227,496 rows x 14 columns. 
 
 
-# Option 1: Create a local R data frame and then convert it to a SparkR DataFrame -------
+args <- commandArgs(trailing = TRUE)
+if (length(args) != 1) {
+  print("Usage: data-manipulation.R <path-to-flights.csv")
+  print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
+  q("no")
+}
 
-## Create a local R dataframe
-flights_df <- read.csv("flights.csv")
+flightsCsvPath <- args[[1]]
+
+
+# # Option 1: Create a local R data frame and then convert it to a SparkR DataFrame -------
+
+# ## Create a local R dataframe
+flights_df <- read.csv(flightsCsvPath, header = TRUE)
 flights_df$date <- as.Date(flights_df$date)
 
 ## Convert the local data frame into a SparkR DataFrame
 flightsDF <- createDataFrame(sqlContext, flights_df)
 
 # Option 2: Alternatively, directly create a SparkR DataFrame from the source data -------
-flightsDF <- read.df(sqlContext, "flights.csv", source = "csv", header = "true")
+flightsDF <- read.df(sqlContext, flightsCsvPath, source = "csv", header = "true")
 
 # Print the schema of this Spark DataFrame
 printSchema(flightsDF)

From 3a97867dfddd8ecd96a676b757d818f05cae4dc8 Mon Sep 17 00:00:00 2001
From: "Daniel Emaasit (PhD Student)" <daniel.emaasit@gmail.com>
Date: Wed, 10 Jun 2015 04:08:51 -0700
Subject: [PATCH 15/15] Used fewer rows for createDataFrame

To create a SparkR DataFrame, I used fewer rows of the local data frame.
---
 examples/src/main/r/data-manipulation.R | 34 ++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
index 201bf4493208a..dcf8fd588e5eb 100644
--- a/examples/src/main/r/data-manipulation.R
+++ b/examples/src/main/r/data-manipulation.R
@@ -15,22 +15,21 @@
 # limitations under the License.
 #
 
+# For this example, we shall use the "flights" dataset
+# The dataset consists of every flight departing Houston in 2011.
+# The data set is made up of 227,496 rows x 14 columns. 
 
 # Load SparkR library into your R session
 library(SparkR)
 
+args <- commandArgs(trailingOnly = TRUE)
+
 ## Initialize SparkContext
 sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
 
 ## Initialize SQLContext
-sqlContext <- SparkRSQL.init(sc)
+sqlContext <- sparkRSQL.init(sc)
 
-# For this example, we shall use the "flights" dataset
-# The dataset consists of every flight departing Houston in 2011.
-# The data set is made up of 227,496 rows x 14 columns. 
-
-
-args <- commandArgs(trailing = TRUE)
 if (length(args) != 1) {
   print("Usage: data-manipulation.R <path-to-flights.csv")
   print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv ")
@@ -40,16 +39,17 @@ if (length(args) != 1) {
 flightsCsvPath <- args[[1]]
 
 
-# # Option 1: Create a local R data frame and then convert it to a SparkR DataFrame -------
-
-# ## Create a local R dataframe
+# Create a local R dataframe
 flights_df <- read.csv(flightsCsvPath, header = TRUE)
 flights_df$date <- as.Date(flights_df$date)
 
-## Convert the local data frame into a SparkR DataFrame
-flightsDF <- createDataFrame(sqlContext, flights_df)
+## Filter flights whose destination is San Francisco and write to a local data frame
+SFO_df <- flights_df[flights_df$dest == "SFO", ] 
+
+# Convert the local data frame into a SparkR DataFrame
+SFO_DF <- createDataFrame(sqlContext, SFO_df)
 
-# Option 2: Alternatively, directly create a SparkR DataFrame from the source data -------
+#  Directly create a SparkR DataFrame from the source data
 flightsDF <- read.df(sqlContext, flightsCsvPath, source = "csv", header = "true")
 
 # Print the schema of this Spark DataFrame
@@ -80,17 +80,17 @@ registerTempTable(flightsDF, "flightsTable")
 destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
 
 # Use collect to create a local R data frame
-dest_df <- collect(destDF)
+local_df <- collect(destDF)
 
 # Print the newly created local data frame
-print(dest_df)
+print(local_df)
 
 # Filter flights whose destination is JFK
 jfkDF <- filter(flightsDF, "dest == JFK") ##OR
-jfkDF <- filter(flightsDF, flightsDF$dest == JFK)
+jfkDF <- filter(flightsDF, flightsDF$dest == "JFK")
 
 # Install the magrittr library
-library(magrittr)
+if (requireNamespace("magrittr", quietly = TRUE)) { library(magrittr) }
 
 # Group the flights by date and then find the average daily delay
 # Write the result into a DataFrame