You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
79 lines
3.1 KiB
R
79 lines
3.1 KiB
R
# Data analysis for the Course Project of the "Getting and Cleaning Data"
# course (Data Science track, Coursera).
|
|
|
|
# Part 1. Merges the training and the test sets to create one data set.
# Part 3. Uses descriptive activity names to name the activities in the data set.
# Part 4. Appropriately labels the data set with descriptive variable names.
|
|
|
|
# Abort early when the data folder is missing from the working directory.
# dir.exists() is used instead of file.exists() so that a regular file that
# merely carries the folder's name does not pass the check.
if (!dir.exists("UCI HAR Dataset")) {
  stop("You need 'UCI HAR Dataset' folder full of data")
}
|
|
|
|
|
|
library(plyr) # for mapvalues
|
|
|
|
|
|
# Lookup tables shared by the training and the test sets.
# Both files have no header row, so the columns get the default names V1, V2.

# Activity code (V1) and its descriptive label (V2).
activity_labels <- read.csv(
  "UCI HAR Dataset/activity_labels.txt",
  header = FALSE,
  sep = "",
  colClasses = c("numeric", "character")
)

# Feature number (V1) and feature name (V2); the names are used later as the
# column names of the measurement tables.
features <- read.csv(
  "UCI HAR Dataset/features.txt",
  header = FALSE,
  sep = " ",
  colClasses = c("numeric", "character")
)
|
|
|
|
# Read one partition of the data set and return it as a single data frame:
# the measurement columns first, then Subject, then Activity.
#
# Args:
#   split:           "train" or "test"; used both as the sub-folder name and
#                    as the suffix of the three files read from it.
#   feature_names:   character vector of column names for X_<split>.txt.
#   activity_labels: lookup table with the activity code in V1 and the
#                    descriptive label in V2.
#
# Returns:
#   A data frame with one row per observation of the requested partition.
read_har_split <- function(split, feature_names, activity_labels) {
  # Builds e.g. "UCI HAR Dataset/train/X_train.txt" for prefix "X".
  split_file <- function(prefix) {
    file.path("UCI HAR Dataset", split, paste0(prefix, "_", split, ".txt"))
  }

  subject <- read.csv(split_file("subject"), header = FALSE,
                      colClasses = "numeric", col.names = "Subject")
  y <- read.csv(split_file("y"), header = FALSE, colClasses = "numeric")
  # check.names = FALSE keeps the feature names exactly as given instead of
  # letting R rewrite them into syntactic names.
  x <- read.csv(split_file("X"), sep = "", header = FALSE,
                colClasses = "numeric", col.names = feature_names,
                check.names = FALSE)

  # Replace the numeric activity codes with their descriptive labels.
  activity <- data.frame(
    Activity = mapvalues(y$V1, from = activity_labels$V1,
                         to = activity_labels$V2)
  )

  cbind(x, subject, activity)
}

# Forming full dataframe: training rows first, then test rows
data <- rbind(
  read_har_split("train", features$V2, activity_labels),
  read_har_split("test", features$V2, activity_labels)
)

# Cleaning memory: only the merged data frame is needed from here on
rm(features, activity_labels, read_har_split)
|
|
|
|
|
|
# Part 2. Extracts only the measurements on the mean and standard deviation for each measurement.

# Positions of the columns whose name contains "mean" or "std".
# The match is case-sensitive, so gravityMean, tBodyAccMean, tBodyAccJerkMean,
# tBodyGyroMean and tBodyGyroJerkMean are excluded: these represent
# derivations of angle data, as opposed to the original feature vector.
# NOTE(review): the pattern also matches names containing "meanFreq" --
# confirm that keeping those columns is intended.
cols2match <- grep("(mean|std)", names(data))

# Locate the identifier columns by name rather than by a hard-coded position,
# so the subset stays correct if the number of feature columns ever changes.
id_cols <- match(c("Subject", "Activity"), names(data))
stopifnot(!anyNA(id_cols))

# Subsetting data frame, also moving the identifier columns to be first
Subsetted_data_frame <- data[, c(id_cols, cols2match)]
|
|
|
|
# Part 5. From the data set in step 4, creates a second, independent tidy data set
# with the average of each variable for each activity and each subject.

library(dplyr) # for %>%, group_by, summarise and across

# across(everything(), mean) averages every non-grouping column; it replaces
# the deprecated summarise_each(funs(mean)). .groups = "drop" returns an
# ungrouped table, one row per Subject/Activity pair.
tidydata <- Subsetted_data_frame %>%
  group_by(Subject, Activity) %>%
  summarise(across(everything(), mean), .groups = "drop")

write.table(tidydata, "tidydata.txt", row.names = FALSE)
|