Data Wrangling Final Project

Introduction

This document presents the analysis for the Data Wrangling Final Project. The goal is to apply data wrangling techniques to real-world automotive data to gain insights into various aspects of the dataset.

Data Collection

# Load necessary libraries
library(httr)
library(jsonlite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(writexl)

# Model years to query. 1999 is included because the yearly price table in
# this report shows cars from 1999; the earlier hand-written list skipped it,
# so no 1999 car could ever be matched against a recall.
years <- c(1973, 1984, 1993:2020)

# Fetch the makes the NHTSA API lists with recalls for one model year.
#
# Args:
#   year: a single model year (numeric).
# Returns:
#   A data frame with columns modelYear and Make (lower-cased). It has zero
#   rows when the response contains no makes. NULL is returned, with a
#   warning, when the request fails or the status code is not 200.
fetch_recall_makes <- function(year) {
  api_url <- paste0(
    "https://api.nhtsa.gov/products/vehicle/makes?modelYear=", year,
    "&issueType=r"
  )

  # A transport error for one year should not abort the whole collection.
  response <- tryCatch(
    GET(api_url),
    error = function(e) {
      warning(
        "Recall request for model year ", year, " failed: ",
        conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(response)) {
    return(NULL)
  }
  if (status_code(response) != 200) {
    warning(
      "Recall request for model year ", year, " returned HTTP status ",
      status_code(response),
      call. = FALSE
    )
    return(NULL)
  }

  year_data <- fromJSON(rawToChar(response$content), flatten = TRUE)
  makes <- tolower(year_data$results$make)

  # rep() keeps both columns the same length even when `makes` is empty;
  # data.frame(modelYear = year, Make = character(0)) would raise an error.
  data.frame(
    modelYear = rep(year, length(makes)),
    Make = makes,
    stringsAsFactors = FALSE
  )
}

# Empty template so recall_data has the expected columns even if every
# request fails.
recall_data <- data.frame(modelYear = integer(), Make = character(), stringsAsFactors = FALSE)

# Collect the per-year results first and bind them once, rather than growing
# the data frame with rbind() on every iteration.
recall_data <- do.call(
  rbind,
  c(list(recall_data), lapply(years, fetch_recall_makes))
)

Data Preprocessing

# Load and preprocess the usa_cars dataset
usa_cars <- read.csv("USA_cars_datasets.csv")

# One key per (make, model year) pair that has a recall, e.g. "ford|2015".
recall_keys <- paste(recall_data$Make, recall_data$modelYear, sep = "|")

# Lower-case model and brand so they compare cleanly with the lower-cased
# makes in recall_data, then flag each car whose brand AND year together
# appear in the recall list.
#
# The previous rule tested `model %in% Make` and `year %in% modelYear`
# separately. That compared model names against make names, and accepted any
# year that had a recall for any make at all.
# NOTE: recall tables printed later in this report were produced with the
# previous rule and need to be regenerated by re-knitting.
usa_cars <- usa_cars %>%
  mutate(
    model = tolower(model),
    brand = tolower(brand),
    Recall = ifelse(
      paste(brand, year, sep = "|") %in% recall_keys,
      "Yes",
      "No"
    )
  )

Research Question 1: Average Car Price by Year

# Mean of `price` for each value of `year`; NA prices are dropped from the mean.
average_price_by_year <- summarise(
  group_by(usa_cars, year),
  average_price = mean(price, na.rm = TRUE)
)
print("Average Car Price by Year:")

## [1] "Average Car Price by Year:"

print(average_price_by_year)

## # A tibble: 30 × 2
##     year average_price
##    <int>         <dbl>
##  1  1973      29800   
##  2  1984         25   
##  3  1993          0   
##  4  1994         12.5 
##  5  1995          0   
##  6  1996          0   
##  7  1997          0   
##  8  1998          6.25
##  9  1999         25   
## 10  2000         43.8 
## # ℹ 20 more rows

# Line chart of average price against year.
ggplot(data = average_price_by_year) +
  geom_line(mapping = aes(x = year, y = average_price)) +
  labs(title = "Average Car Price by Year", x = "Year", y = "Average Price") +
  theme_minimal()

Research Question 2: Correlation Between Mileage and Price

# Pearson correlation test between mileage and price. The columns are passed
# as usa_cars$... expressions because cor.test echoes the argument
# expressions in the "data:" line of its printed result.
correlation_mileage_price <- cor.test(
  x = usa_cars$mileage,
  y = usa_cars$price,
  method = "pearson"
)
print("Correlation Between Mileage and Price:")

## [1] "Correlation Between Mileage and Price:"

print(correlation_mileage_price)

## 
##  Pearson's product-moment correlation
## 
## data:  usa_cars$mileage and usa_cars$price
## t = -21.863, df = 2497, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4332395 -0.3674021
## sample estimates:
##        cor 
## -0.4008382

# Scatter plot of price against mileage; semi-transparent points make
# overlapping observations easier to see.
ggplot(data = usa_cars) +
  geom_point(mapping = aes(x = mileage, y = price), alpha = 0.5) +
  labs(title = "Correlation between Mileage and Price", x = "Mileage", y = "Price") +
  theme_minimal()

Research Question 3: Most Common Car Brand

# Count rows per brand, order the counts from largest to smallest, and keep
# only the first row (the most frequent brand).
most_common_brand <- group_by(usa_cars, brand) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head(n = 1)
print("Most Common Car Brand:")

## [1] "Most Common Car Brand:"

print(most_common_brand)

## # A tibble: 1 × 2
##   brand count
##   <chr> <int>
## 1 ford   1235

# Bar chart of the number of rows per brand. The axis-text override is added
# after theme_minimal() so the complete theme does not reset it.
ggplot(data = usa_cars) +
  geom_bar(mapping = aes(x = brand, fill = brand)) +
  labs(title = "Most Common Car Brand", x = "Brand", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Research Question 4: Number of Recalled Cars by Brand

# Keep only the rows whose Recall flag is "Yes".
recalled_cars <- filter(usa_cars, Recall == "Yes")

# Count the recalled rows for each brand, largest count first.
recall_count_by_brand <- group_by(recalled_cars, brand) %>%
  summarise(recall_count = n()) %>%
  arrange(-recall_count)
print("Number of Recalled Cars by Brand:")

## [1] "Number of Recalled Cars by Brand:"

print(recall_count_by_brand)

## # A tibble: 7 × 2
##   brand     recall_count
##   <chr>            <int>
## 1 ford               113
## 2 nissan              59
## 3 chevrolet           10
## 4 buick                3
## 5 jeep                 3
## 6 heartland            1
## 7 toyota               1

# Bar chart of recall counts per brand, with brands ordered from the highest
# count to the lowest. The bar heights come straight from recall_count.
ggplot(data = recall_count_by_brand) +
  geom_col(
    mapping = aes(x = reorder(brand, -recall_count), y = recall_count, fill = brand)
  ) +
  labs(title = "Number of Recalled Cars by Brand", x = "Brand", y = "Recall Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Conclusion

In this analysis, we explored various aspects of automotive data, including the average car price by year, the correlation between mileage and price, the most common car brand, and the number of recalled cars by brand. These insights provide valuable information about the dataset and can inform decision-making in the automotive industry.

Note: This R Markdown file can also be used as a template for similar projects. Replace any [...] placeholders with content specific to your analysis and customize the report as needed. Save it with a .Rmd extension and use RStudio to knit it into an HTML document.