Large Urban Areas population CSV missing population counts

tabular data

missing data

importance: low

Author

Egor Kotov

Published

July 16, 2024

Modified

July 17, 2024

Status: ⚠️ active

Importance: 1 - low

Summary: The dataset contains missing population counts for 49 Large Urban Areas in the CSV file poblacion_gaus.csv in the zonificacion_GAU folder, as well as in the poblacion.csv file in the zonificacion folder.

Expected Results: The population data for all Large Urban Areas should be available in the CSV file poblacion_gaus.csv in the zonificacion_GAU folder and/or in the poblacion.csv file in the zonificacion folder.

Steps to Reproduce

Load Data

Load libraries and define data files.

library(tidyverse)
library(sf)
library(here)
library(DT)


# Paths to the v2 zoning inputs, resolved with here().
# Large Urban Area (GAU) boundaries, names and population counts:
gau_boundaries_data_file <- here(
  "data/raw_data/v2/zonificacion/zonificacion_GAU/zonificacion_gaus.shp"
)
gau_names_file <- here(
  "data/raw_data/v2/zonificacion/zonificacion_GAU/nombres_gaus.csv"
)
gau_population_file <- here(
  "data/raw_data/v2/zonificacion/zonificacion_GAU/poblacion_gaus.csv"
)
# General population file stored one level up, in the zonificacion folder:
all_population_file <- here(
  "data/raw_data/v2/zonificacion/poblacion.csv"
)

Load the data and join the Large Urban Area names to the boundaries, together with the population counts from both the Large Urban Areas population file (poblacion_gaus.csv) and the general population file (poblacion.csv).

# Read the GAU boundaries and drop every zone whose ID contains "FR", "PT"
# or "externo" (presumably the French, Portuguese and external zones).
gau_boundaries <- read_sf(gau_boundaries_data_file)
gau_boundaries_spain_only <- filter(
  gau_boundaries,
  !grepl("FR|PT|externo", ID)
)

# All three CSV inputs are pipe-delimited.
gau_names <- read_delim(
  gau_names_file,
  delim = "|",
  show_col_types = FALSE,
  name_repair = "unique_quiet"
)

# Column names are supplied explicitly for this file, so its first line is
# read as data rather than as a header.
gau_population <- read_delim(
  gau_population_file,
  col_names = c("ID", "population"),
  delim = "|",
  show_col_types = FALSE,
  name_repair = "unique_quiet"
)

all_population <- read_delim(
  all_population_file,
  delim = "|",
  show_col_types = FALSE,
  name_repair = "unique_quiet"
)

# Attach, by ID: the GAU name, the population from poblacion_gaus.csv
# (`population`), and the total from poblacion.csv (`population_all`).
# Because the total is summed with na.rm = TRUE, an ID whose rows are all NA
# gets 0 rather than NA.
gau_boundaries_spain_only <- gau_boundaries_spain_only |>
  left_join(select(gau_names, ID, name), by = "ID") |>
  left_join(gau_population, by = "ID") |>
  left_join(
    all_population |>
      rename(ID = distrito) |>
      group_by(ID) |>
      summarise(
        population_all = sum(poblacion, na.rm = TRUE),
        .groups = "drop"
      ),
    by = "ID"
  )

Results

Missing population (loaded from the poblacion_gaus.csv in zonificacion_GAU folder)

# Number of remaining GAUs that received no `population` value from
# poblacion_gaus.csv in the join.
sum(is.na(gau_boundaries_spain_only$population))

[1] 49

Population data for Large Urban Areas is also unavailable in the poblacion.csv file in the zonificacion folder

There are no population counts for these districts in the poblacion.csv file either.

# Restrict the check to the Large Urban Areas that have no population in
# poblacion_gaus.csv, then count those that also lack a usable total from
# poblacion.csv. After the left join, `population_all` is NA when the ID does
# not occur in poblacion.csv at all, and 0 when every matching row was NA or
# zero (the totals are summed with na.rm = TRUE), so both cases are counted.
gau_boundaries_spain_only |>
  filter(is.na(population)) |>
  filter(is.na(population_all) | population_all == 0) |>
  nrow()

[1] 49

Names of districts with missing population data

# Interactive table of the GAUs without a population value; the geometry
# column is dropped so that only the attribute columns are displayed.
gau_boundaries_spain_only |>
  st_drop_geometry() |>
  filter(is.na(population)) |>
  DT::datatable()

Links to the original files

# Load the project's download helpers and obtain the file listing
# (load_latest_v2_xml() is presumably defined in the sourced script).
source(here("R/901-download-helpers.R"))
files <- load_latest_v2_xml()

# Keep only the listing rows whose local file name matches one of the four
# input files used in this report, and wrap each URL in an HTML anchor that
# opens in a new tab.
relevant_files <- files |>
  filter(
    basename(local_path) %in% basename(c(
      gau_boundaries_data_file,
      gau_names_file,
      gau_population_file,
      all_population_file
    ))
  ) |>
  mutate(
    target_url = paste0(
      "<a href='", target_url, "' target='_blank'>", target_url, "</a>"
    )
  )

# escape = FALSE lets the anchors render as clickable links instead of text.
DT::datatable(relevant_files, escape = FALSE, options = list(pageLength = 5))