# thesis/listings/run_analysis.R

# Analysis of the data for the Course Project of the "Getting and Cleaning Data" course (Data Science track, Coursera).

# Part 1. Merges the training and the test sets to create one data set.
# Part 3. Uses descriptive activity names to name the activities in the data set.
# Part 4. Appropriately labels the data set with descriptive variable names.

# Fail fast when the raw data is missing: the rest of the script reads every
# input file from this folder, looked up relative to the working directory.
# dir.exists() is used instead of file.exists() so that a regular file which
# merely carries the folder's name is rejected as well.
if (!dir.exists("UCI HAR Dataset")) {
    stop("You need 'UCI HAR Dataset' folder full of data")
}


library(plyr) # for mapvalues


# Parts 1, 3 and 4: load both splits, label them and merge them into `data`.

# Root folder of the raw data, relative to the working directory.
har_dir <- "UCI HAR Dataset"

# Lookup tables shared by both splits. After reading, V1 holds the numeric
# index/code and V2 the text (feature name or activity label).
features <- read.csv(file.path(har_dir, "features.txt"), sep = " ",
                     header = FALSE, colClasses = c("numeric", "character"))
# sep = "" makes the reader split on any run of white space.
activity_labels <- read.csv(file.path(har_dir, "activity_labels.txt"),
                            sep = "", header = FALSE,
                            colClasses = c("numeric", "character"))

# Read one split of the data set and return it as a single data frame.
#
# split           - sub-folder and file suffix of the split: "train" or "test".
# feature_names   - character vector used as column names of the X_* file.
# activity_labels - data frame mapping activity code (V1) to label (V2).
# har_dir         - root folder of the raw data.
#
# Returns the measurement columns followed by `Subject` and `Activity`,
# in that order.
read_har_split <- function(split, feature_names, activity_labels, har_dir) {
    # Files are laid out as <har_dir>/<split>/<prefix>_<split>.txt
    split_file <- function(prefix) {
        file.path(har_dir, split, paste0(prefix, "_", split, ".txt"))
    }

    subject <- read.csv(split_file("subject"), header = FALSE,
                        colClasses = "numeric", col.names = "Subject")
    activity_codes <- read.csv(split_file("y"), header = FALSE,
                               colClasses = "numeric")
    # check.names = FALSE keeps the feature names exactly as listed
    # (no conversion to syntactic names, no de-duplication).
    measurements <- read.csv(split_file("X"), sep = "", header = FALSE,
                             colClasses = "numeric",
                             col.names = feature_names, check.names = FALSE)

    # Replace the numeric activity codes with their descriptive labels.
    activity <- data.frame(
        Activity = mapvalues(activity_codes$V1,
                             from = activity_labels$V1,
                             to = activity_labels$V2)
    )

    cbind(measurements, subject, activity)
}

# Forming full dataframe: training rows first, then test rows
data <- rbind(
    read_har_split("train", features$V2, activity_labels, har_dir),
    read_har_split("test", features$V2, activity_labels, har_dir)
)

# Cleaning memory: only `data` is needed from here on
rm(har_dir, features, activity_labels, read_har_split)


# Part 2. Extracts only the measurements on the mean and standard deviation
# for each measurement.

# grep() is case-sensitive by default, so this keeps every column whose name
# contains lower-case "mean" or "std". Names that only contain capitalised
# "Mean" (gravityMean, tBodyAccMean, tBodyAccJerkMean, tBodyGyroMean,
# tBodyGyroJerkMean) are therefore excluded, as these represent derivations
# of angle data, as opposed to the original feature vector.
# NOTE(review): the pattern also matches names such as "meanFreq", if any are
# present - confirm that this is intended.
cols2match <- grep("(mean|std)", names(data))

# Locate the identifier columns by name rather than by hard-coded position,
# so the subset stays correct if the number of feature columns ever changes.
id_cols <- match(c("Subject", "Activity"), names(data))
if (anyNA(id_cols)) {
    stop("'data' must contain 'Subject' and 'Activity' columns")
}

# Subsetting data frame, also moving the identifier columns to be first
Subsetted_data_frame <- data[, c(id_cols, cols2match)]

# Part 5. From the data set in step 4, creates a second, independent tidy data
# set with the average of each variable for each activity and each subject.

library(dplyr) # for %>%, group_by, summarise and across


# One row per (Subject, Activity) pair; every other column is replaced by its
# mean within that pair. across(everything(), ...) leaves the grouping columns
# alone. This replaces the deprecated summarise_each(funs(mean)) call.
# .groups = "drop" returns an ungrouped result; the file written below is the
# same either way.
tidydata <- Subsetted_data_frame %>%
    group_by(Subject, Activity) %>%
    summarise(across(everything(), mean), .groups = "drop")

# Written with write.table's default separator (a single space), no row names.
write.table(tidydata, "tidydata.txt", row.names = FALSE)