You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

79 lines
3.1 KiB
R

# Analysis of data on Course Project at Getting and Cleaning data course of Data Science track at Coursera.
# Part 1. Merges the training and the test sets to create one data set.
# 3. Uses descriptive activity names to name the activities in the data set
# 4. Appropriately labels the data set with descriptive variable names.
if (!file.exists("UCI HAR Dataset")) {
stop("You need 'UCI HAR Dataset' folder full of data")
}
library(plyr) # for mapvalues
#getting common data
features <- read.csv("UCI HAR Dataset/features.txt",sep=" ", header = FALSE,
colClasses = c("numeric","character"))
activity_labels <- read.csv("UCI HAR Dataset/activity_labels.txt",sep="",
header = FALSE,colClasses = c("numeric","character"))
#getting train set data
subject_train <- read.csv("UCI HAR Dataset/train/subject_train.txt",
header = FALSE,colClasses = "numeric",col.names="Subject")
y_train <- read.csv("UCI HAR Dataset/train/y_train.txt", header = FALSE,
colClasses = "numeric")
x_train <- read.csv("UCI HAR Dataset/train/X_train.txt",sep="", header = FALSE,
colClasses = "numeric",col.names=features$V2,check.names = FALSE)
activity_train <- as.data.frame(mapvalues(y_train$V1, from = activity_labels$V1,
to = activity_labels$V2))
names(activity_train) <- "Activity"
#getting test set data
subject_test <- read.csv("UCI HAR Dataset/test/subject_test.txt",
header = FALSE,colClasses = "numeric",col.names="Subject")
y_test <- read.csv("UCI HAR Dataset/test/y_test.txt", header = FALSE,
colClasses = "numeric")
x_test <- read.csv("UCI HAR Dataset/test/X_test.txt",sep="", header = FALSE,
colClasses = "numeric",col.names=features$V2,check.names = FALSE)
activity_test <- as.data.frame(mapvalues(y_test$V1, from = activity_labels$V1,
to = activity_labels$V2))
names(activity_test) <- "Activity"
# Forming full dataframe
data_train <- cbind(x_train,subject_train,activity_train)
data_test <- cbind(x_test,subject_test,activity_test)
data <- rbind(data_train, data_test)
# Cleaning memory
rm(features, activity_labels, subject_train, y_train, x_train, activity_train,
subject_test, y_test, x_test, activity_test, data_train, data_test)
# Part 2. Extracts only the measurements on the mean and standard deviation for each measurement.
cols2match <- grep("(mean|std)",names(data))
# Excluded gravityMean, tBodyAccMean, tBodyAccJerkMean, tBodyGyroMean,
# tBodyGyroJerkMean, as these represent derivations of angle data, as
# opposed to the original feature vector.
# Subsetting data frame, also moving last columns to be first
Subsetted_data_frame <- data[ ,c(562, 563, cols2match)]
# Part 5. From the data set in step 4, creates a second, independent tidy data set
# with the average of each variable for each activity and each subject.
library(dplyr) # for %>% and summarise_each
tidydata <- Subsetted_data_frame %>% group_by(Subject,Activity) %>%
summarise_each(funs(mean))
write.table(tidydata, "tidydata.txt", row.names=FALSE)