This document presents the analysis for the Data Wrangling Final Project. The goal is to apply data wrangling techniques to real-world automotive data to gain insights into various aspects of the dataset.
# Load necessary libraries
library(httr)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(writexl)
# Model years to query for recall data. 1999 is included because the car
# dataset summarised further down contains 1999 vehicles; leaving it out
# means those rows can never be matched against recall data.
years <- c(1973, 1984, 1993:2020)

# Fetch the makes returned by the NHTSA "makes" endpoint for one model year
# (queried with issueType=r).
#
# year: a single model year.
# Returns a data frame with columns `modelYear` and `Make` (lower-cased), or
# NULL when the request errors, the status is not 200, the body cannot be
# parsed, or no makes are listed. Failures emit a warning rather than being
# skipped silently, so an incomplete `recall_data` is visible when knitting.
fetch_recall_makes <- function(year) {
  api_url <- paste0(
    "https://api.nhtsa.gov/products/vehicle/makes?modelYear=", year,
    "&issueType=r"
  )
  makes <- tryCatch(
    {
      response <- GET(api_url)
      if (status_code(response) != 200) {
        stop("HTTP status ", status_code(response), call. = FALSE)
      }
      # Decode explicitly as UTF-8 so make names with non-ASCII characters
      # survive tolower() below.
      body_text <- content(response, as = "text", encoding = "UTF-8")
      fromJSON(body_text, flatten = TRUE)$results$make
    },
    error = function(e) {
      warning(
        "Recall lookup for model year ", year, " failed: ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  # data.frame() cannot recycle a zero-length column against a length-one
  # year, so an empty result is reported as "nothing for this year".
  if (length(makes) == 0) {
    return(NULL)
  }
  data.frame(modelYear = year, Make = tolower(makes), stringsAsFactors = FALSE)
}

# Bind all per-year results in one step instead of growing a data frame with
# rbind() inside a loop. The empty template guarantees that both columns
# exist even if every request fails.
recall_data <- bind_rows(
  data.frame(modelYear = numeric(), Make = character(), stringsAsFactors = FALSE),
  Filter(Negate(is.null), lapply(years, fetch_recall_makes))
)
# Load and preprocess the usa_cars dataset
usa_cars <- read.csv("USA_cars_datasets.csv")

# One key per (make, model year) pair present in the recall data. Matching on
# the pair matters: testing make and year separately would flag a car whenever
# its year has *any* recall and its name matches *any* make.
recall_keys <- paste(recall_data$Make, recall_data$modelYear, sep = "|")

# Normalise text columns to lower case (recall makes are lower-cased too) and
# flag each car whose brand has a recall entry for its model year. The
# comparison uses `brand`, not `model`: `recall_data$Make` holds manufacturer
# names, so comparing it with model names only matches by coincidence.
# NOTE(review): brand spellings in the CSV may not always equal NHTSA make
# names - verify a sample. Recall counts change with this fix, so re-knit to
# refresh the printed recall tables.
usa_cars <- usa_cars %>%
  mutate(
    model = tolower(model),
    brand = tolower(brand),
    Recall = ifelse(paste(brand, year, sep = "|") %in% recall_keys, "Yes", "No")
  )
# Mean of `price` for each value of `year`; missing prices are ignored.
average_price_by_year <- summarise(
  group_by(usa_cars, year),
  average_price = mean(price, na.rm = TRUE)
)
# Print a heading followed by the per-year summary table.
print("Average Car Price by Year:")
## [1] "Average Car Price by Year:"
print(average_price_by_year)
## # A tibble: 30 × 2
## year average_price
## <int> <dbl>
## 1 1973 29800
## 2 1984 25
## 3 1993 0
## 4 1994 12.5
## 5 1995 0
## 6 1996 0
## 7 1997 0
## 8 1998 6.25
## 9 1999 25
## 10 2000 43.8
## # ℹ 20 more rows
# Line chart of the per-year average price.
ggplot(data = average_price_by_year, mapping = aes(year, average_price)) +
  labs(title = "Average Car Price by Year", x = "Year", y = "Average Price") +
  theme_minimal() +
  geom_line()
# Pearson correlation test between the mileage and price columns of usa_cars.
# The arguments are passed as `usa_cars$...` expressions; the "data:" line of
# the printed result echoes them verbatim.
correlation_mileage_price <- cor.test(usa_cars$mileage, usa_cars$price, method = "pearson")
# Print a heading, then the full test result (statistic, df, p-value,
# confidence interval and the estimated correlation).
print("Correlation Between Mileage and Price:")
## [1] "Correlation Between Mileage and Price:"
print(correlation_mileage_price)
##
## Pearson's product-moment correlation
##
## data: usa_cars$mileage and usa_cars$price
## t = -21.863, df = 2497, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4332395 -0.3674021
## sample estimates:
## cor
## -0.4008382
# Scatter plot of price against mileage; points are semi-transparent so that
# overlapping observations remain distinguishable.
ggplot(data = usa_cars, mapping = aes(mileage, price)) +
  labs(
    title = "Correlation between Mileage and Price",
    x = "Mileage",
    y = "Price"
  ) +
  theme_minimal() +
  geom_point(alpha = 0.5)
# Count the rows for each brand, order the brands from most to least
# frequent, and keep only the first row of that ranking.
most_common_brand <- slice(
  arrange(
    summarise(group_by(usa_cars, brand), count = n()),
    desc(count)
  ),
  1
)
# Print a heading followed by the single top-brand row.
print("Most Common Car Brand:")
## [1] "Most Common Car Brand:"
print(most_common_brand)
## # A tibble: 1 × 2
## brand count
## <chr> <int>
## 1 ford 1235
# Bar chart with one bar per brand (bar height = number of rows), coloured by
# brand. The x-axis labels are rotated so the brand names do not overlap.
ggplot(data = usa_cars, mapping = aes(brand, fill = brand)) +
  labs(title = "Most Common Car Brand", x = "Brand", y = "Count") +
  geom_bar() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Keep only the cars whose Recall flag is "Yes".
recalled_cars <- filter(usa_cars, Recall == "Yes")
# Tally the flagged cars for each brand, largest count first.
recall_count_by_brand <- arrange(
  summarise(group_by(recalled_cars, brand), recall_count = n()),
  desc(recall_count)
)
# Print a heading followed by the per-brand recall table.
print("Number of Recalled Cars by Brand:")
## [1] "Number of Recalled Cars by Brand:"
print(recall_count_by_brand)
## # A tibble: 7 × 2
## brand recall_count
## <chr> <int>
## 1 ford 113
## 2 nissan 59
## 3 chevrolet 10
## 4 buick 3
## 5 jeep 3
## 6 heartland 1
## 7 toyota 1
# Bar chart of the precomputed recall counts, with brands ordered from the
# highest to the lowest count and x-axis labels rotated for legibility.
ggplot(
  data = recall_count_by_brand,
  mapping = aes(reorder(brand, -recall_count), recall_count, fill = brand)
) +
  labs(
    title = "Number of Recalled Cars by Brand",
    x = "Brand",
    y = "Recall Count"
  ) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
In this analysis, we explored various aspects of automotive data, including the average car price by year, the correlation between mileage and price, the most common car brand, and the number of recalled cars by brand. These insights provide valuable information about the dataset and can inform decision-making in the automotive industry.
```
You can use this R Markdown file as a template for your project.
Replace [...] with content specific to your analysis, and customize the report as needed.
Save it with a .Rmd extension and use RStudio to knit it into an HTML document.