### Data cleaning done by Gab Chen-- clean dataset loaded directly for other group members
# setwd("C:/Users/Gab/OneDrive/Documents/SPRING 2026/CPLN5920_PPA/Lab3")
#
# # Load Property Sales Dataset
# sales_raw <- st_read("data/opa_properties_public.geojson")
#
# # Convert date
# sales_clean <- sales_raw %>%
# mutate(
# sale_date = as.Date(sale_date),
# sale_year = year(sale_date)
# )
#
# # Remove outliers
# sales_clean <- sales_clean %>%
# filter(
# sale_price > 10000, # remove extremely low prices
# sale_price < 50000000, # remove extreme luxury outliers
# )
#
# # Remove missing key fields
# sales_clean <- sales_clean %>%
# drop_na(
# sale_price,
# total_livable_area,
# number_of_bedrooms,
# number_of_bathrooms,
# )
#
# # Select necessary rows
# sales_clean <- sales_clean %>%
# select(
# category_code,
# category_code_description,
# garage_spaces,
# number_of_bathrooms,
# number_of_bedrooms,
# number_stories,
# total_livable_area,
# year_built,
# sale_year,
# sale_price,
# zip_code,
# geometry
# )
#
# # Filter to residential only
# sales_clean <- sales_clean %>%
# filter(
# grepl("1", category_code)
# )
#
# # Filter to 2023-2025 only
# sales_clean <- sales_clean %>%
# filter(sale_year >= 2023, sale_year <= 2024)
#
#
# # Save cleaned dataset
# saveRDS(sales_clean, "sales_clean.rds")
#
# # Summary table showing before and after
# tibble(
# Stage = c("Raw", "Cleaned"),
# N = c(nrow(sales_raw), nrow(sales_clean))
# )
# Load cleaned dataset
sales_clean <- readRDS("data/sales_clean.rds")
# Load variables
acs_vars_2024 <- load_variables(2024, "acs5", cache = TRUE)
# Load secondary data on access + socioeconomics + amenities + spacial structure
# Define ACS variables
census_vars <- c(
med_income = "B19013_001", # Median household income
med_home_value = "B25077_001", # Median home value
total_pop = "B01003_001", # Total population
poverty_count = "B17001_002", # Below poverty
poverty_total = "B17001_001", # Poverty universe
total_edu = "B15003_001", # Education total
edu_bachelors = "B15003_022", # Bachelor's
edu_masters = "B15003_023",
edu_prof = "B15003_024",
edu_phd = "B15003_025",
tenure_total = "B25003_001", # Total occupied
owner_occupied = "B25003_002", # Owner occupied
white = "B03002_003",
black = "B03002_004",
latinx = "B03002_012",
total_workforce = "B08124_001",
car = "B08006_002", #commute
transit = "B08006_008",
bike = "B08006_014",
walk = "B08006_015",
remote = "B08006_017",
rent_3034 = "B25070_007",
rent_3539 = "B25070_008",
rent_4049 = "B25070_009",
rent_50 = "B25070_010"
)
# Pull census data
phl_census <- get_acs(
geography = "tract",
variables = census_vars,
state = "PA",
county = "Philadelphia",
year = 2024,
survey = "acs5",
geometry = TRUE,
progress = FALSE
)
# Reshape and Mutate
phl_census_wide <- phl_census %>%
select(GEOID, NAME, variable, estimate, geometry) %>% # Drop MOE
pivot_wider(names_from = variable, values_from = estimate) %>% # Pivot to wide table
mutate(
# % College educated (Bachelor’s and above)
pct_college = (
edu_bachelors + edu_masters + edu_prof + edu_phd
) / total_edu,
# Poverty rate
poverty_rate = poverty_count / poverty_total,
# % Owner occupied
pct_owner_occ = owner_occupied / tenure_total,
# % white
white = white/total_pop,
# % black
black = black/total_pop,
# % latinx
latinx = latinx/total_pop,
# car commute
car = car/total_workforce,
# transit commute
transit = transit/total_workforce,
# bike commute
bike = bike/total_workforce,
# walk commute
walk = walk/total_workforce,
# remote commute
remote = remote/total_workforce,
# % rent burden
pct_rent_burden = (rent_3034 + rent_3539 + rent_4049 + rent_50)/total_pop
) %>%
select(
GEOID,
total_pop,
med_income,
med_home_value,
pct_college,
poverty_rate,
pct_owner_occ,
pct_rent_burden,
white,
black,
latinx,
car,
transit,
bike,
walk,
remote,
geometry
) %>%
drop_na(
med_income,
med_home_value,
pct_college,
poverty_rate,
pct_owner_occ,
pct_rent_burden,
white,
black,
latinx,
car,
transit,
bike,
walk,
remote
)
# Transit data on SEPTA bus/subways stops and regional rail stations: https://opendataphilly.org/datasets/septa-routes-stops-locations/
stops <- st_read("data/Transit_Stops_(Spring_2025)/Transit_Stops_(Spring_2025).shp")
# Parks/Green Space: https://opendataphilly.org/datasets/ppr-properties/
parks <- st_read("data/PPR_Properties/PPR_Properties.shp")
# Distance to Center City (anchor to city hall)
city_hall <- st_as_sf(
data.frame(
name = "City Hall",
lon = -75.1636,
lat = 39.9526
),
coords = c("lon", "lat"),
crs = 4326
)
# Schools: https://opendataphilly.org/datasets/schools/
schools <- st_read("data/Schools/Schools.shp")
# Crime: https://opendataphilly.org/datasets/crime-incidents/
crime <- st_read("data/incidents_part1_part2/incidents_part1_part2.shp")
# Crashes: https://opendataphilly.org/datasets/crashes/
crash <- st_read("data/collision_crash_2020_2024/collision_crash_2020_2024.shp")
# Landmarks: https://opendataphilly.org/datasets/city-landmarks/
landmarks <- st_read("data/Landmark_Points/Landmark_Points.shp")
# PA Hospitals: https://opendataphilly.org/datasets/pa-hospitals/
hospitals <- st_read("data/DOH_Hospitals202311.geojson")
# Neighborhoods: https://opendataphilly.org/datasets/philadelphia-neighborhoods/
nb <- st_read("data/philadelphia-neighborhoods/philadelphia-neighborhoods.shp")
# Convert sales data to sf
sales_sf <- st_as_sf(
sales_clean,
coords = c("longitude", "latitude"),
crs = 4326
)
# Match CRS to NAD83 Pennsylvania South (feet), 2272
sales_sf <- st_transform(sales_sf, 2272)
phl_census_wide <- st_transform(phl_census_wide, 2272)
stops <- st_transform(stops, 2272)
parks <- st_transform(parks, 2272)
city_hall <- st_transform(city_hall, 2272)
schools <- st_transform(schools, 2272)
crime <- st_transform(crime, 2272)
crash <- st_transform(crash, 2272)
landmarks <- st_transform(landmarks, 2272)
hospitals <- st_transform(hospitals, 2272)
nb <- st_transform(nb, 2272)
# Join census data to sales_clean
sales_sf <- st_join(sales_sf, phl_census_wide)
# Spatial join: Assign each house to its neighborhood
sales_sf <- sales_sf %>%
st_join(nb, join = st_intersects)
# Check results
sales_sf %>%
st_drop_geometry() %>%
count(NAME) %>%
arrange(desc(n))