Important Notes
- Children dataset contains ONLY children aged 0-6
years
- All analyses use proper population weights
(coefin/10,000)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(scales)
# Update these paths to your local file locations
data_adult <- read.csv("C:/Users/satya/OneDrive - Politecnico di Milano/Joint Folder - Pekerti & Fratesi/cardiff_periodabroad/cardiff_project/analysis/data/avq2023_sub_adult.csv")
data_children <- read.csv("C:/Users/satya/OneDrive - Politecnico di Milano/Joint Folder - Pekerti & Fratesi/cardiff_periodabroad/cardiff_project/analysis/data/avq2023_sub_children.csv")
data_hh <- read.csv("C:/Users/satya/OneDrive - Politecnico di Milano/Joint Folder - Pekerti & Fratesi/cardiff_periodabroad/cardiff_project/analysis/data/avq2023_sub_hh.csv")
# Create weights (divide coefin by 10,000)
data_adult$weight <- data_adult$coefin / 10000
data_children$weight <- data_children$coefin / 10000
cat("Adult dataset dimensions:", dim(data_adult), "\n")
Adult dataset dimensions: 36844 127
cat("Children dataset dimensions:", dim(data_children), "\n")
Children dataset dimensions: 1925 24
cat("Household dataset dimensions:", dim(data_hh), "\n\n")
Household dataset dimensions: 18565 7
cat("Weighted populations:\n")
Weighted populations:
cat("Adults (15+):", format(sum(data_adult$weight), big.mark=","), "\n")
Adults (15+): 51,121,050
cat("Children (0-6):", format(sum(data_children$weight), big.mark=","), "\n")
Children (0-6): 3,006,816
# Region names mapping
region_names <- c(
'10' = "Piemonte", '20' = "Valle d'Aosta", '30' = "Liguria",
'41' = "Lombardia", '42' = "Provincia Autonoma Bolzano/Bozen",
'50' = "Provincia Autonoma Trento", '60' = "Veneto", '70' = "Friuli-Venezia Giulia",
'80' = "Emilia-Romagna", '90' = "Toscana", '100' = "Umbria",
'110' = "Marche", '120' = "Lazio", '130' = "Abruzzo",
'140' = "Molise", '150' = "Campania", '160' = "Puglia",
'170' = "Basilicata", '180' = "Calabria", '190' = "Sicilia", '200' = "Sardegna"
)
# Ripartizione mapping
rip_names <- c(
'1' = "Nord-Ovest", '2' = "Nord-Est", '3' = "Centro",
'4' = "Sud", '5' = "Isole"
)
# Add to adult data
data_adult <- data_adult %>%
mutate(
region_name = region_names[as.character(reg)],
rip_name = rip_names[as.character(rip)]
)
Data Description
This analysis uses the AVQ (Aspetti della Vita Quotidiana)
2023 survey data from ISTAT with proper population
weights (coefin variable divided by 10,000).
Dataset Overview
- Adult dataset: 36,844 individuals aged 15+
(weighted population: 51.1 million)
- Household dataset: 18,565 households (weighted:
26.2 million households)
- Children dataset: 1,925 children aged 0-6
years only (weighted population: 3.0
million)
Important Note: The children dataset contains
only children aged 0-6 years, not all children aged
0-14. This is specifically focused on the pre-school age range.
Geographic Coverage: - National level (Italy) -
Regional breakdown: 21 regions - Macro-area classification
(Ripartizioni): - Nord-Ovest (Northwest) - Nord-Est (Northeast) - Centro
(Center) - Sud (South) - Isole (Islands)
Weighting: All results use the ISTAT survey weights
(coefin/10,000) to provide population-representative estimates.
1. Family Size Distribution (Weighted)
# Get household weights (from household head)
household_weights <- data_adult %>%
filter(demo_relpar == 1) %>%
select(hhid, weight, reg, rip)
# Count family size
family_size <- data_adult %>%
group_by(hhid) %>%
summarise(family_size = n()) %>%
left_join(household_weights, by = "hhid")
total_households <- sum(family_size$weight, na.rm = TRUE)
cat("=== ITALY ===\n")
=== ITALY ===
cat("Total weighted households:", format(total_households, big.mark=","), "\n\n")
Total weighted households: 26,201,432
# Weighted distribution
family_size_dist <- family_size %>%
group_by(family_size) %>%
summarise(
households = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(percentage = households / total_households * 100)
print(family_size_dist)
# Weighted average
weighted_avg <- sum(family_size$family_size * family_size$weight, na.rm = TRUE) /
sum(family_size$weight, na.rm = TRUE)
cat("\nWeighted average family size:", round(weighted_avg, 2), "\n")
Weighted average family size: 1.95
# Lombardy
cat("\n=== LOMBARDY ===\n")
=== LOMBARDY ===
lombardy_families <- family_size %>% filter(reg == 41)
total_lom <- sum(lombardy_families$weight, na.rm = TRUE)
cat("Total weighted households:", format(total_lom, big.mark=","), "\n\n")
Total weighted households: 230,150
lom_dist <- lombardy_families %>%
group_by(family_size) %>%
summarise(
households = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(percentage = households / total_lom * 100)
print(lom_dist)
weighted_avg_lom <- sum(lombardy_families$family_size * lombardy_families$weight, na.rm = TRUE) /
sum(lombardy_families$weight, na.rm = TRUE)
cat("\nWeighted average family size:", round(weighted_avg_lom, 2), "\n")
Weighted average family size: 1.94
Visualization
# Italy
p1 <- ggplot(family_size_dist, aes(x = factor(family_size), y = households/1e6)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = sprintf("%.2fM", households/1e6)), vjust = -0.5, size = 3) +
labs(title = "Family Size Distribution - Italy (Weighted)",
x = "Family Size (members)", y = "Households (millions)") +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
# Lombardy
p2 <- ggplot(lom_dist, aes(x = factor(family_size), y = households/1e3)) +
geom_col(fill = "coral") +
geom_text(aes(label = sprintf("%.0fK", households/1e3)), vjust = -0.5, size = 3) +
labs(title = "Family Size Distribution - Lombardy (Weighted)",
x = "Family Size (members)", y = "Households (thousands)") +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
gridExtra::grid.arrange(p1, p2, ncol = 2)

2. North vs South Comparison (Weighted)
# Add macro area
family_size <- family_size %>%
mutate(macro_area = case_when(
rip %in% c(1, 2) ~ "North",
rip == 3 ~ "Center",
rip %in% c(4, 5) ~ "South"
))
# Macro area statistics
macro_stats <- family_size %>%
filter(!is.na(macro_area)) %>%
group_by(macro_area) %>%
summarise(
avg_size = sum(family_size * weight, na.rm = TRUE) / sum(weight, na.rm = TRUE),
households = sum(weight, na.rm = TRUE),
.groups = "drop"
)
cat("=== Macro Area Comparison ===\n")
=== Macro Area Comparison ===
print(macro_stats)
# By ripartizione
rip_stats <- family_size %>%
filter(!is.na(rip)) %>%
group_by(rip) %>%
summarise(
avg_size = sum(family_size * weight, na.rm = TRUE) / sum(weight, na.rm = TRUE),
households = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(rip_name = rip_names[as.character(rip)])
cat("\n=== By Ripartizione ===\n")
=== By Ripartizione ===
print(rip_stats)
Visualization
ggplot(rip_stats, aes(x = reorder(rip_name, -avg_size), y = avg_size)) +
geom_col(aes(fill = rip_name)) +
geom_hline(yintercept = weighted_avg, linetype = "dashed", color = "black") +
geom_text(aes(label = sprintf("%.2f", avg_size)), vjust = -0.5, fontface = "bold") +
annotate("text", x = 5, y = weighted_avg + 0.05, label = "National Average", size = 3) +
labs(title = "Average Family Size by Ripartizione (Weighted)",
x = "", y = "Average Family Size (members)") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 15, hjust = 1))

3. Female Employment Rate (Weighted)
# Working-age females (15-64)
working_age_females <- data_adult %>%
filter(demo_female == 1, demo_age >= 15, demo_age <= 64)
total_females <- sum(working_age_females$weight, na.rm = TRUE)
employed_females <- sum(working_age_females$weight[working_age_females$demo_empl == 1], na.rm = TRUE)
emp_rate_italy <- employed_females / total_females * 100
cat("=== ITALY ===\n")
=== ITALY ===
cat("Employment Rate:", round(emp_rate_italy, 2), "%\n")
Employment Rate: 51.14 %
cat("Employed females:", format(employed_females, big.mark=","), "\n")
Employed females: 9,472,721
cat("Working-age females:", format(total_females, big.mark=","), "\n")
Working-age females: 18,523,550
# Lombardy
lom_females <- working_age_females %>% filter(reg == 41)
total_lom_fem <- sum(lom_females$weight, na.rm = TRUE)
employed_lom <- sum(lom_females$weight[lom_females$demo_empl == 1], na.rm = TRUE)
emp_rate_lom <- employed_lom / total_lom_fem * 100
cat("\n=== LOMBARDY ===\n")
=== LOMBARDY ===
cat("Employment Rate:", round(emp_rate_lom, 2), "%\n")
Employment Rate: 66.62 %
cat("Employed females:", format(employed_lom, big.mark=","), "\n")
Employed females: 112,350.4
cat("Working-age females:", format(total_lom_fem, big.mark=","), "\n")
Working-age females: 168,637.8
# By Ripartizione
emp_by_rip <- working_age_females %>%
group_by(rip) %>%
summarise(
employed = sum(weight[demo_empl == 1], na.rm = TRUE),
total = sum(weight, na.rm = TRUE),
rate = employed / total * 100,
.groups = "drop"
) %>%
mutate(rip_name = rip_names[as.character(rip)])
cat("\n=== By Ripartizione ===\n")
=== By Ripartizione ===
print(emp_by_rip)
Visualization
ggplot(emp_by_rip, aes(x = reorder(rip_name, -rate), y = rate)) +
geom_col(aes(fill = rip_name)) +
geom_hline(yintercept = emp_rate_italy, linetype = "dashed", color = "black") +
geom_text(aes(label = sprintf("%.1f%%", rate)), vjust = -0.5, fontface = "bold") +
annotate("text", x = 5, y = emp_rate_italy + 2, label = "National Average", size = 3) +
labs(title = "Female Employment Rate (15-64) by Ripartizione (Weighted)",
x = "", y = "Employment Rate (%)") +
ylim(0, 70) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 15, hjust = 1))

4. Distribution of Children per Family (Weighted)
Note: Children dataset contains only ages 0-6
# Household info
household_info <- data_adult %>%
filter(demo_relpar == 1) %>%
select(hhid, weight, reg, rip)
# Count children per family
children_per_family <- data_children %>%
group_by(profam) %>%
summarise(
n_children = n(),
coefin = first(coefin),
.groups = "drop"
) %>%
mutate(weight = coefin / 10000) %>%
rename(hhid = profam) %>%
left_join(
data_adult %>% filter(demo_relpar == 1) %>%
select(hhid, reg, rip) %>% distinct(),
by = "hhid"
)
total_fam_children <- sum(children_per_family$weight, na.rm = TRUE)
cat("=== ITALY - Families with Children (0-6) ===\n")
=== ITALY - Families with Children (0-6) ===
cat("Total families:", format(total_fam_children, big.mark=","), "\n\n")
Total families: 2,397,253
children_dist <- children_per_family %>%
group_by(n_children) %>%
summarise(
families = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(percentage = families / total_fam_children * 100)
print(children_dist)
weighted_avg_children <- sum(children_per_family$n_children * children_per_family$weight, na.rm = TRUE) /
sum(children_per_family$weight, na.rm = TRUE)
cat("\nWeighted average:", round(weighted_avg_children, 2), "children per family\n")
Weighted average: 1.25 children per family
# Lombardy
lom_children_fam <- children_per_family %>% filter(reg == 41)
total_lom_fam <- sum(lom_children_fam$weight, na.rm = TRUE)
cat("\n=== LOMBARDY - Families with Children (0-6) ===\n")
=== LOMBARDY - Families with Children (0-6) ===
cat("Total families:", format(total_lom_fam, big.mark=","), "\n\n")
Total families: 27,147.6
lom_children_dist <- lom_children_fam %>%
group_by(n_children) %>%
summarise(
families = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(percentage = families / total_lom_fam * 100)
print(lom_children_dist)
weighted_avg_lom_child <- sum(lom_children_fam$n_children * lom_children_fam$weight, na.rm = TRUE) /
sum(lom_children_fam$weight, na.rm = TRUE)
cat("\nWeighted average:", round(weighted_avg_lom_child, 2), "children per family\n")
Weighted average: 1.33 children per family
# All families (including those without children 0-6)
all_families <- household_info %>%
left_join(children_per_family %>% select(hhid, n_children), by = "hhid") %>%
mutate(n_children = ifelse(is.na(n_children), 0, n_children))
total_all_fam <- sum(all_families$weight, na.rm = TRUE)
cat("\n=== ALL FAMILIES (including without children 0-6) ===\n")
=== ALL FAMILIES (including without children 0-6) ===
cat("Total families:", format(total_all_fam, big.mark=","), "\n\n")
Total families: 26,201,432
all_children_dist <- all_families %>%
group_by(n_children) %>%
summarise(
families = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(percentage = families / total_all_fam * 100)
print(head(all_children_dist, 6))
Visualization
p1 <- ggplot(children_dist, aes(x = factor(n_children), y = families/1e3)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = sprintf("%.0fK", families/1e3)), vjust = -0.5, size = 3) +
labs(title = "Children per Family - Among Families with Children (Weighted)",
x = "Number of Children (0-6)", y = "Families (thousands)") +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
p2 <- ggplot(head(all_children_dist, 5), aes(x = factor(n_children), y = families/1e6)) +
geom_col(fill = "coral") +
geom_text(aes(label = sprintf("%.2fM", families/1e6)), vjust = -0.5, size = 3) +
labs(title = "Children per Family - All Families (Weighted)",
x = "Number of Children (0-6)", y = "Families (millions)") +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
gridExtra::grid.arrange(p1, p2, ncol = 2)

5. Working Mothers (Weighted)
# Employed females aged 15-64
employed_females_data <- data_adult %>%
filter(demo_female == 1, demo_empl == 1, demo_age >= 15, demo_age <= 64)
total_emp_women <- sum(employed_females_data$weight, na.rm = TRUE)
working_mothers <- sum(employed_females_data$weight[employed_females_data$demo_mother == 1], na.rm = TRUE)
mothers_share_italy <- working_mothers / total_emp_women * 100
cat("=== ITALY ===\n")
=== ITALY ===
cat("Total employed women:", format(total_emp_women, big.mark=","), "\n")
Total employed women: 9,472,721
cat("Working mothers:", format(working_mothers, big.mark=","), "\n")
Working mothers: 6,337,612
cat("Share:", round(mothers_share_italy, 2), "%\n")
Share: 66.9 %
# Lombardy
lom_emp <- employed_females_data %>% filter(reg == 41)
total_lom_emp <- sum(lom_emp$weight, na.rm = TRUE)
lom_mothers <- sum(lom_emp$weight[lom_emp$demo_mother == 1], na.rm = TRUE)
mothers_share_lom <- lom_mothers / total_lom_emp * 100
cat("\n=== LOMBARDY ===\n")
=== LOMBARDY ===
cat("Total employed women:", format(total_lom_emp, big.mark=","), "\n")
Total employed women: 112,350.4
cat("Working mothers:", format(lom_mothers, big.mark=","), "\n")
Working mothers: 74,324.23
cat("Share:", round(mothers_share_lom, 2), "%\n")
Share: 66.15 %
# By Ripartizione
mothers_by_rip <- employed_females_data %>%
group_by(rip) %>%
summarise(
working_mothers = sum(weight[demo_mother == 1], na.rm = TRUE),
total_employed = sum(weight, na.rm = TRUE),
share = working_mothers / total_employed * 100,
.groups = "drop"
) %>%
mutate(rip_name = rip_names[as.character(rip)])
cat("\n=== By Ripartizione ===\n")
=== By Ripartizione ===
print(mothers_by_rip)
Visualization
ggplot(mothers_by_rip, aes(x = reorder(rip_name, -share), y = share)) +
geom_col(aes(fill = rip_name)) +
geom_hline(yintercept = mothers_share_italy, linetype = "dashed", color = "black") +
geom_text(aes(label = sprintf("%.1f%%", share)), vjust = -0.5, fontface = "bold") +
annotate("text", x = 5, y = mothers_share_italy + 1.5, label = "National Average", size = 3) +
labs(title = "Working Mothers Among Employed Women by Ripartizione (Weighted)",
x = "", y = "Share of Working Mothers (%)") +
ylim(0, 80) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 15, hjust = 1))

6. Pre-School Attendance (Weighted)
Note: Children dataset contains ONLY ages 0-6
total_children <- sum(data_children$weight, na.rm = TRUE)
# Merge with household info
children_with_region <- data_children %>%
left_join(
data_adult %>% filter(demo_relpar == 1) %>%
select(hhid, reg, rip) %>% distinct(),
by = c("profam" = "hhid")
)
# Formal childcare
attending_formal <- sum(children_with_region$weight[children_with_region$childcare_formal == 1], na.rm = TRUE)
not_attending <- sum(children_with_region$weight[children_with_region$childcare_formal == 0], na.rm = TRUE)
attendance_rate <- attending_formal / total_children * 100
cat("=== ITALY ===\n")
=== ITALY ===
cat("Total children (0-6):", format(total_children, big.mark=","), "\n")
Total children (0-6): 3,006,816
cat("Children in formal childcare:", format(attending_formal, big.mark=","), "\n")
Children in formal childcare: 1,613,274
cat("Children not in formal childcare:", format(not_attending, big.mark=","), "\n")
Children not in formal childcare: 1,393,543
cat("Attendance rate:", round(attendance_rate, 2), "%\n")
Attendance rate: 53.65 %
# By type of care
care_types <- children_with_region %>%
group_by(care_enrolled) %>%
summarise(
children = sum(weight, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
percentage = children / total_children * 100,
care_type = case_when(
care_enrolled == 1 ~ "Asilo Nido (Nursery, 0-2)",
care_enrolled == 14 ~ "Not enrolled",
care_enrolled == 15 ~ "Scuola Infanzia (Kindergarten, 3-5)",
care_enrolled == 16 ~ "Scuola Primaria (Primary, 6+)",
TRUE ~ as.character(care_enrolled)
)
)
cat("\n=== By Type of Care ===\n")
=== By Type of Care ===
print(care_types %>% select(care_type, children, percentage))
# Pre-school specific
preschool_rate <- sum(children_with_region$weight[children_with_region$care_enrolled %in% c(1, 15)], na.rm = TRUE) /
total_children * 100
cat("\nPre-school (Asilo Nido + Scuola Infanzia) rate:", round(preschool_rate, 2), "%\n")
Pre-school (Asilo Nido + Scuola Infanzia) rate: 70.47 %
# Lombardy
lom_children <- children_with_region %>% filter(reg == 41)
total_lom_child <- sum(lom_children$weight, na.rm = TRUE)
lom_attending <- sum(lom_children$weight[lom_children$childcare_formal == 1], na.rm = TRUE)
lom_attendance <- lom_attending / total_lom_child * 100
cat("\n=== LOMBARDY ===\n")
=== LOMBARDY ===
cat("Total children (0-6):", format(total_lom_child, big.mark=","), "\n")
Total children (0-6): 36,075.57
cat("Children in formal childcare:", format(lom_attending, big.mark=","), "\n")
Children in formal childcare: 16,995.04
cat("Attendance rate:", round(lom_attendance, 2), "%\n")
Attendance rate: 47.11 %
# By Ripartizione
attendance_by_rip <- children_with_region %>%
group_by(rip) %>%
summarise(
attending = sum(weight[childcare_formal == 1], na.rm = TRUE),
total = sum(weight, na.rm = TRUE),
rate = attending / total * 100,
.groups = "drop"
) %>%
mutate(rip_name = rip_names[as.character(rip)])
cat("\n=== By Ripartizione ===\n")
=== By Ripartizione ===
print(attendance_by_rip)
Visualization
ggplot(attendance_by_rip, aes(x = reorder(rip_name, -rate), y = rate)) +
geom_col(aes(fill = rip_name)) +
geom_hline(yintercept = attendance_rate, linetype = "dashed", color = "black") +
geom_text(aes(label = sprintf("%.1f%%", rate)), vjust = -0.5, fontface = "bold") +
annotate("text", x = 5, y = attendance_rate + 2, label = "National Average", size = 3) +
labs(title = "Pre-School Attendance Rate by Ripartizione (Children 0-6, Weighted)",
x = "", y = "Attendance Rate (%)") +
ylim(0, 70) +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold"),
axis.text.x = element_text(angle = 15, hjust = 1))

Summary Statistics Table
summary_data <- data.frame(
Indicator = c(
"Total households (millions)",
"Average family size (Italy)",
"Average family size (Lombardy)",
"Average family size (North)",
"Average family size (South)",
"Female employment rate - Italy (%)",
"Female employment rate - Lombardy (%)",
"Female employment rate - North (%)",
"Female employment rate - South (%)",
"Working mothers share - Italy (%)",
"Working mothers share - Sud (%)",
"Pre-school attendance - Italy (%)",
"Pre-school attendance - Lombardy (%)",
"Children (0-6) per family (with children)"
),
Value = c(
round(total_households/1e6, 2),
round(weighted_avg, 2),
round(weighted_avg_lom, 2),
round(macro_stats$avg_size[macro_stats$macro_area == "North"], 2),
round(macro_stats$avg_size[macro_stats$macro_area == "South"], 2),
round(emp_rate_italy, 2),
round(emp_rate_lom, 2),
round(mean(emp_by_rip$rate[emp_by_rip$rip %in% c(1,2)]), 2),
round(mean(emp_by_rip$rate[emp_by_rip$rip %in% c(4,5)]), 2),
round(mothers_share_italy, 2),
round(mothers_by_rip$share[mothers_by_rip$rip == 4], 2),
round(attendance_rate, 2),
round(lom_attendance, 2),
round(weighted_avg_children, 2)
)
)
kable(summary_data, caption = "Key Indicators Summary (Weighted)")
Key Indicators Summary (Weighted)
| Indicator |
Value |
| Total households (millions) |
26.20 |
| Average family size (Italy) |
1.95 |
| Average family size (Lombardy) |
1.94 |
| Average family size (North) |
1.90 |
| Average family size (South) |
2.05 |
| Female employment rate - Italy (%) |
51.14 |
| Female employment rate - Lombardy (%) |
66.62 |
| Female employment rate - North (%) |
60.70 |
| Female employment rate - South (%) |
35.13 |
| Working mothers share - Italy (%) |
66.90 |
| Working mothers share - Sud (%) |
73.69 |
| Pre-school attendance - Italy (%) |
53.65 |
| Pre-school attendance - Lombardy (%) |
47.11 |
| Children (0-6) per family (with children) |
1.25 |
Report generated: 2025-11-20
All analyses use proper population weights
(coefin/10,000)
Points of Discussion
- The AVQ survey doesn’t have any questions regarding religious stance
or conservativeness. I was thinking that maybe this can be proxied with
Euroskepticism?
There is a question asking about ‘how much do you believe in:
European Parliament?’
There are also several questions asking the perception of several
disorders on their community, such as:
How safe do you feel walking on the street when it’s dark and
you’re alone in the area where you live?
In the area where you live, how often do you see: people using
drugs
In the area where you live, how often do you see: people dealing
drugs
In the area where you live, how often do you see: prostitutes
looking for clients
In the area where you live, how often do you see: acts of
vandalism against public property (broken phone booths, trash bins,
etc.)
In the area where you live, how often do you see: vagrants,
people with no fixed address -
---
title: "Huggins Briefing - 2025/11/19"
subtitle: "Data Description and Preliminary Analysis"
output: html_notebook
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```

## Important Notes

1. **Children dataset contains ONLY children aged 0-6 years**
2. **All analyses use proper population weights** (coefin/10,000)

```{r load_libraries}
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(scales)
```

```{r load_data}
# Update these paths to your local file locations
data_adult <- read.csv("C:/Users/satya/OneDrive - Politecnico di Milano/Joint Folder - Pekerti & Fratesi/cardiff_periodabroad/cardiff_project/analysis/data/avq2023_sub_adult.csv")
data_children <- read.csv("C:/Users/satya/OneDrive - Politecnico di Milano/Joint Folder - Pekerti & Fratesi/cardiff_periodabroad/cardiff_project/analysis/data/avq2023_sub_children.csv")
data_hh <- read.csv("C:/Users/satya/OneDrive - Politecnico di Milano/Joint Folder - Pekerti & Fratesi/cardiff_periodabroad/cardiff_project/analysis/data/avq2023_sub_hh.csv")

# Create weights (divide coefin by 10,000)
data_adult$weight <- data_adult$coefin / 10000
data_children$weight <- data_children$coefin / 10000
```

```{r data_overview}
cat("Adult dataset dimensions:", dim(data_adult), "\n")
cat("Children dataset dimensions:", dim(data_children), "\n")
cat("Household dataset dimensions:", dim(data_hh), "\n\n")

cat("Weighted populations:\n")
cat("Adults (15+):", format(sum(data_adult$weight), big.mark=","), "\n")
cat("Children (0-6):", format(sum(data_children$weight), big.mark=","), "\n")
```

```{r region_mapping}
# Region names mapping
region_names <- c(
  '10' = "Piemonte", '20' = "Valle d'Aosta", '30' = "Liguria",
  '41' = "Lombardia", '42' = "Provincia Autonoma Bolzano/Bozen",
  '50' = "Provincia Autonoma Trento", '60' = "Veneto", '70' = "Friuli-Venezia Giulia",
  '80' = "Emilia-Romagna", '90' = "Toscana", '100' = "Umbria",
  '110' = "Marche", '120' = "Lazio", '130' = "Abruzzo",
  '140' = "Molise", '150' = "Campania", '160' = "Puglia",
  '170' = "Basilicata", '180' = "Calabria", '190' = "Sicilia", '200' = "Sardegna"
)

# Ripartizione mapping
rip_names <- c(
  '1' = "Nord-Ovest", '2' = "Nord-Est", '3' = "Centro",
  '4' = "Sud", '5' = "Isole"
)

# Add to adult data
data_adult <- data_adult %>%
  mutate(
    region_name = region_names[as.character(reg)],
    rip_name = rip_names[as.character(rip)]
  )
```

# Data Description

This analysis uses the **AVQ (Aspetti della Vita Quotidiana) 2023** survey data from ISTAT with **proper population weights** (coefin variable divided by 10,000).

## Dataset Overview

- **Adult dataset**: 36,844 individuals aged 15+ (weighted population: **51.1 million**)
- **Household dataset**: 18,565 households (weighted: **26.2 million households**)
- **Children dataset**: 1,925 children **aged 0-6 years only** (weighted population: **3.0 million**)

**Important Note**: The children dataset contains **only children aged 0-6 years**, not all children aged 0-14. This is specifically focused on the pre-school age range.

**Geographic Coverage:**
- National level (Italy)
- Regional breakdown: 21 regions
- Macro-area classification (Ripartizioni):
  - Nord-Ovest (Northwest)
  - Nord-Est (Northeast)
  - Centro (Center)
  - Sud (South)
  - Isole (Islands)

**Weighting**: All results use the ISTAT survey weights (coefin/10,000) to provide population-representative estimates.

---

# 1. Family Size Distribution (Weighted)

```{r family_size}
# Get household weights (from household head)
household_weights <- data_adult %>%
  filter(demo_relpar == 1) %>%
  select(hhid, weight, reg, rip)

# Count family size
family_size <- data_adult %>%
  group_by(hhid) %>%
  summarise(family_size = n()) %>%
  left_join(household_weights, by = "hhid")

total_households <- sum(family_size$weight, na.rm = TRUE)

cat("=== ITALY ===\n")
cat("Total weighted households:", format(total_households, big.mark=","), "\n\n")

# Weighted distribution
family_size_dist <- family_size %>%
  group_by(family_size) %>%
  summarise(
    households = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(percentage = households / total_households * 100)

print(family_size_dist)

# Weighted average
weighted_avg <- sum(family_size$family_size * family_size$weight, na.rm = TRUE) / 
                sum(family_size$weight, na.rm = TRUE)
cat("\nWeighted average family size:", round(weighted_avg, 2), "\n")

# Lombardy
cat("\n=== LOMBARDY ===\n")
lombardy_families <- family_size %>% filter(reg == 41)
total_lom <- sum(lombardy_families$weight, na.rm = TRUE)
cat("Total weighted households:", format(total_lom, big.mark=","), "\n\n")

lom_dist <- lombardy_families %>%
  group_by(family_size) %>%
  summarise(
    households = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(percentage = households / total_lom * 100)

print(lom_dist)

weighted_avg_lom <- sum(lombardy_families$family_size * lombardy_families$weight, na.rm = TRUE) / 
                    sum(lombardy_families$weight, na.rm = TRUE)
cat("\nWeighted average family size:", round(weighted_avg_lom, 2), "\n")
```

### Visualization

```{r family_size_plot, fig.width=12, fig.height=5}
# Italy
p1 <- ggplot(family_size_dist, aes(x = factor(family_size), y = households/1e6)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = sprintf("%.2fM", households/1e6)), vjust = -0.5, size = 3) +
  labs(title = "Family Size Distribution - Italy (Weighted)",
       x = "Family Size (members)", y = "Households (millions)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

# Lombardy
p2 <- ggplot(lom_dist, aes(x = factor(family_size), y = households/1e3)) +
  geom_col(fill = "coral") +
  geom_text(aes(label = sprintf("%.0fK", households/1e3)), vjust = -0.5, size = 3) +
  labs(title = "Family Size Distribution - Lombardy (Weighted)",
       x = "Family Size (members)", y = "Households (thousands)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

gridExtra::grid.arrange(p1, p2, ncol = 2)
```

---

# 2. North vs South Comparison (Weighted)

```{r north_south}
# Add macro area
family_size <- family_size %>%
  mutate(macro_area = case_when(
    rip %in% c(1, 2) ~ "North",
    rip == 3 ~ "Center",
    rip %in% c(4, 5) ~ "South"
  ))

# Macro area statistics
macro_stats <- family_size %>%
  filter(!is.na(macro_area)) %>%
  group_by(macro_area) %>%
  summarise(
    avg_size = sum(family_size * weight, na.rm = TRUE) / sum(weight, na.rm = TRUE),
    households = sum(weight, na.rm = TRUE),
    .groups = "drop"
  )

cat("=== Macro Area Comparison ===\n")
print(macro_stats)

# By ripartizione
rip_stats <- family_size %>%
  filter(!is.na(rip)) %>%
  group_by(rip) %>%
  summarise(
    avg_size = sum(family_size * weight, na.rm = TRUE) / sum(weight, na.rm = TRUE),
    households = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(rip_name = rip_names[as.character(rip)])

cat("\n=== By Ripartizione ===\n")
print(rip_stats)
```

### Visualization

```{r north_south_plot, fig.width=10, fig.height=6}
ggplot(rip_stats, aes(x = reorder(rip_name, -avg_size), y = avg_size)) +
  geom_col(aes(fill = rip_name)) +
  geom_hline(yintercept = weighted_avg, linetype = "dashed", color = "black") +
  geom_text(aes(label = sprintf("%.2f", avg_size)), vjust = -0.5, fontface = "bold") +
  annotate("text", x = 5, y = weighted_avg + 0.05, label = "National Average", size = 3) +
  labs(title = "Average Family Size by Ripartizione (Weighted)",
       x = "", y = "Average Family Size (members)") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"),
        axis.text.x = element_text(angle = 15, hjust = 1))
```

---

# 3. Female Employment Rate (Weighted)

```{r female_employment}
# Working-age females (15-64)
working_age_females <- data_adult %>%
  filter(demo_female == 1, demo_age >= 15, demo_age <= 64)

total_females <- sum(working_age_females$weight, na.rm = TRUE)
employed_females <- sum(working_age_females$weight[working_age_females$demo_empl == 1], na.rm = TRUE)

emp_rate_italy <- employed_females / total_females * 100

cat("=== ITALY ===\n")
cat("Employment Rate:", round(emp_rate_italy, 2), "%\n")
cat("Employed females:", format(employed_females, big.mark=","), "\n")
cat("Working-age females:", format(total_females, big.mark=","), "\n")

# Lombardy
lom_females <- working_age_females %>% filter(reg == 41)
total_lom_fem <- sum(lom_females$weight, na.rm = TRUE)
employed_lom <- sum(lom_females$weight[lom_females$demo_empl == 1], na.rm = TRUE)
emp_rate_lom <- employed_lom / total_lom_fem * 100

cat("\n=== LOMBARDY ===\n")
cat("Employment Rate:", round(emp_rate_lom, 2), "%\n")
cat("Employed females:", format(employed_lom, big.mark=","), "\n")
cat("Working-age females:", format(total_lom_fem, big.mark=","), "\n")

# By Ripartizione
emp_by_rip <- working_age_females %>%
  group_by(rip) %>%
  summarise(
    employed = sum(weight[demo_empl == 1], na.rm = TRUE),
    total = sum(weight, na.rm = TRUE),
    rate = employed / total * 100,
    .groups = "drop"
  ) %>%
  mutate(rip_name = rip_names[as.character(rip)])

cat("\n=== By Ripartizione ===\n")
print(emp_by_rip)
```

### Visualization

```{r employment_plot, fig.width=10, fig.height=6}
ggplot(emp_by_rip, aes(x = reorder(rip_name, -rate), y = rate)) +
  geom_col(aes(fill = rip_name)) +
  geom_hline(yintercept = emp_rate_italy, linetype = "dashed", color = "black") +
  geom_text(aes(label = sprintf("%.1f%%", rate)), vjust = -0.5, fontface = "bold") +
  annotate("text", x = 5, y = emp_rate_italy + 2, label = "National Average", size = 3) +
  labs(title = "Female Employment Rate (15-64) by Ripartizione (Weighted)",
       x = "", y = "Employment Rate (%)") +
  ylim(0, 70) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"),
        axis.text.x = element_text(angle = 15, hjust = 1))
```

---

# 4. Distribution of Children per Family (Weighted)

**Note**: Children dataset contains only ages 0-6

```{r children_distribution}
# Household info
household_info <- data_adult %>%
  filter(demo_relpar == 1) %>%
  select(hhid, weight, reg, rip)

# Count children per family
children_per_family <- data_children %>%
  group_by(profam) %>%
  summarise(
    n_children = n(),
    coefin = first(coefin),
    .groups = "drop"
  ) %>%
  mutate(weight = coefin / 10000) %>%
  rename(hhid = profam) %>%
  left_join(
    data_adult %>% filter(demo_relpar == 1) %>% 
      select(hhid, reg, rip) %>% distinct(),
    by = "hhid"
  )

total_fam_children <- sum(children_per_family$weight, na.rm = TRUE)

cat("=== ITALY - Families with Children (0-6) ===\n")
cat("Total families:", format(total_fam_children, big.mark=","), "\n\n")

children_dist <- children_per_family %>%
  group_by(n_children) %>%
  summarise(
    families = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(percentage = families / total_fam_children * 100)

print(children_dist)

weighted_avg_children <- sum(children_per_family$n_children * children_per_family$weight, na.rm = TRUE) / 
                         sum(children_per_family$weight, na.rm = TRUE)
cat("\nWeighted average:", round(weighted_avg_children, 2), "children per family\n")

# Lombardy
lom_children_fam <- children_per_family %>% filter(reg == 41)
total_lom_fam <- sum(lom_children_fam$weight, na.rm = TRUE)

cat("\n=== LOMBARDY - Families with Children (0-6) ===\n")
cat("Total families:", format(total_lom_fam, big.mark=","), "\n\n")

lom_children_dist <- lom_children_fam %>%
  group_by(n_children) %>%
  summarise(
    families = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(percentage = families / total_lom_fam * 100)

print(lom_children_dist)

weighted_avg_lom_child <- sum(lom_children_fam$n_children * lom_children_fam$weight, na.rm = TRUE) / 
                          sum(lom_children_fam$weight, na.rm = TRUE)
cat("\nWeighted average:", round(weighted_avg_lom_child, 2), "children per family\n")

# All families (including those without children 0-6)
all_families <- household_info %>%
  left_join(children_per_family %>% select(hhid, n_children), by = "hhid") %>%
  mutate(n_children = ifelse(is.na(n_children), 0, n_children))

total_all_fam <- sum(all_families$weight, na.rm = TRUE)

cat("\n=== ALL FAMILIES (including without children 0-6) ===\n")
cat("Total families:", format(total_all_fam, big.mark=","), "\n\n")

all_children_dist <- all_families %>%
  group_by(n_children) %>%
  summarise(
    families = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(percentage = families / total_all_fam * 100)

print(head(all_children_dist, 6))
```

### Visualization

```{r children_plot, fig.width=12, fig.height=5}
p1 <- ggplot(children_dist, aes(x = factor(n_children), y = families/1e3)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = sprintf("%.0fK", families/1e3)), vjust = -0.5, size = 3) +
  labs(title = "Children per Family - Among Families with Children (Weighted)",
       x = "Number of Children (0-6)", y = "Families (thousands)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

p2 <- ggplot(head(all_children_dist, 5), aes(x = factor(n_children), y = families/1e6)) +
  geom_col(fill = "coral") +
  geom_text(aes(label = sprintf("%.2fM", families/1e6)), vjust = -0.5, size = 3) +
  labs(title = "Children per Family - All Families (Weighted)",
       x = "Number of Children (0-6)", y = "Families (millions)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

gridExtra::grid.arrange(p1, p2, ncol = 2)
```

---

# 5. Working Mothers (Weighted)

```{r working_mothers}
# Employed females aged 15-64
employed_females_data <- data_adult %>%
  filter(demo_female == 1, demo_empl == 1, demo_age >= 15, demo_age <= 64)

total_emp_women <- sum(employed_females_data$weight, na.rm = TRUE)
working_mothers <- sum(employed_females_data$weight[employed_females_data$demo_mother == 1], na.rm = TRUE)

mothers_share_italy <- working_mothers / total_emp_women * 100

cat("=== ITALY ===\n")
cat("Total employed women:", format(total_emp_women, big.mark=","), "\n")
cat("Working mothers:", format(working_mothers, big.mark=","), "\n")
cat("Share:", round(mothers_share_italy, 2), "%\n")

# Lombardy
lom_emp <- employed_females_data %>% filter(reg == 41)
total_lom_emp <- sum(lom_emp$weight, na.rm = TRUE)
lom_mothers <- sum(lom_emp$weight[lom_emp$demo_mother == 1], na.rm = TRUE)
mothers_share_lom <- lom_mothers / total_lom_emp * 100

cat("\n=== LOMBARDY ===\n")
cat("Total employed women:", format(total_lom_emp, big.mark=","), "\n")
cat("Working mothers:", format(lom_mothers, big.mark=","), "\n")
cat("Share:", round(mothers_share_lom, 2), "%\n")

# By Ripartizione
mothers_by_rip <- employed_females_data %>%
  group_by(rip) %>%
  summarise(
    working_mothers = sum(weight[demo_mother == 1], na.rm = TRUE),
    total_employed = sum(weight, na.rm = TRUE),
    share = working_mothers / total_employed * 100,
    .groups = "drop"
  ) %>%
  mutate(rip_name = rip_names[as.character(rip)])

cat("\n=== By Ripartizione ===\n")
print(mothers_by_rip)
```

### Visualization

```{r mothers_plot, fig.width=10, fig.height=6}
ggplot(mothers_by_rip, aes(x = reorder(rip_name, -share), y = share)) +
  geom_col(aes(fill = rip_name)) +
  geom_hline(yintercept = mothers_share_italy, linetype = "dashed", color = "black") +
  geom_text(aes(label = sprintf("%.1f%%", share)), vjust = -0.5, fontface = "bold") +
  annotate("text", x = 5, y = mothers_share_italy + 1.5, label = "National Average", size = 3) +
  labs(title = "Working Mothers Among Employed Women by Ripartizione (Weighted)",
       x = "", y = "Share of Working Mothers (%)") +
  ylim(0, 80) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"),
        axis.text.x = element_text(angle = 15, hjust = 1))
```

---

# 6. Pre-School Attendance (Weighted)

**Note**: Children dataset contains ONLY ages 0-6

```{r preschool}
total_children <- sum(data_children$weight, na.rm = TRUE)

# Merge with household info
children_with_region <- data_children %>%
  left_join(
    data_adult %>% filter(demo_relpar == 1) %>% 
      select(hhid, reg, rip) %>% distinct(),
    by = c("profam" = "hhid")
  )

# Formal childcare
attending_formal <- sum(children_with_region$weight[children_with_region$childcare_formal == 1], na.rm = TRUE)
not_attending <- sum(children_with_region$weight[children_with_region$childcare_formal == 0], na.rm = TRUE)

attendance_rate <- attending_formal / total_children * 100

cat("=== ITALY ===\n")
cat("Total children (0-6):", format(total_children, big.mark=","), "\n")
cat("Children in formal childcare:", format(attending_formal, big.mark=","), "\n")
cat("Children not in formal childcare:", format(not_attending, big.mark=","), "\n")
cat("Attendance rate:", round(attendance_rate, 2), "%\n")

# By type of care
care_types <- children_with_region %>%
  group_by(care_enrolled) %>%
  summarise(
    children = sum(weight, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    percentage = children / total_children * 100,
    care_type = case_when(
      care_enrolled == 1 ~ "Asilo Nido (Nursery, 0-2)",
      care_enrolled == 14 ~ "Not enrolled",
      care_enrolled == 15 ~ "Scuola Infanzia (Kindergarten, 3-5)",
      care_enrolled == 16 ~ "Scuola Primaria (Primary, 6+)",
      TRUE ~ as.character(care_enrolled)
    )
  )

cat("\n=== By Type of Care ===\n")
print(care_types %>% select(care_type, children, percentage))

# Pre-school specific
preschool_rate <- sum(children_with_region$weight[children_with_region$care_enrolled %in% c(1, 15)], na.rm = TRUE) / 
                  total_children * 100
cat("\nPre-school (Asilo Nido + Scuola Infanzia) rate:", round(preschool_rate, 2), "%\n")

# Lombardy
lom_children <- children_with_region %>% filter(reg == 41)
total_lom_child <- sum(lom_children$weight, na.rm = TRUE)
lom_attending <- sum(lom_children$weight[lom_children$childcare_formal == 1], na.rm = TRUE)
lom_attendance <- lom_attending / total_lom_child * 100

cat("\n=== LOMBARDY ===\n")
cat("Total children (0-6):", format(total_lom_child, big.mark=","), "\n")
cat("Children in formal childcare:", format(lom_attending, big.mark=","), "\n")
cat("Attendance rate:", round(lom_attendance, 2), "%\n")

# By Ripartizione
attendance_by_rip <- children_with_region %>%
  group_by(rip) %>%
  summarise(
    attending = sum(weight[childcare_formal == 1], na.rm = TRUE),
    total = sum(weight, na.rm = TRUE),
    rate = attending / total * 100,
    .groups = "drop"
  ) %>%
  mutate(rip_name = rip_names[as.character(rip)])

cat("\n=== By Ripartizione ===\n")
print(attendance_by_rip)
```

### Visualization

```{r preschool_plot, fig.width=10, fig.height=6}
ggplot(attendance_by_rip, aes(x = reorder(rip_name, -rate), y = rate)) +
  geom_col(aes(fill = rip_name)) +
  geom_hline(yintercept = attendance_rate, linetype = "dashed", color = "black") +
  geom_text(aes(label = sprintf("%.1f%%", rate)), vjust = -0.5, fontface = "bold") +
  annotate("text", x = 5, y = attendance_rate + 2, label = "National Average", size = 3) +
  labs(title = "Pre-School Attendance Rate by Ripartizione (Children 0-6, Weighted)",
       x = "", y = "Attendance Rate (%)") +
  ylim(0, 70) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"),
        axis.text.x = element_text(angle = 15, hjust = 1))
```

---

# Summary Statistics Table

```{r summary_table}
summary_data <- data.frame(
  Indicator = c(
    "Total households (millions)",
    "Average family size (Italy)",
    "Average family size (Lombardy)",
    "Average family size (North)",
    "Average family size (South)",
    "Female employment rate - Italy (%)",
    "Female employment rate - Lombardy (%)",
    "Female employment rate - North (%)",
    "Female employment rate - South (%)",
    "Working mothers share - Italy (%)",
    "Working mothers share - Sud (%)",
    "Pre-school attendance - Italy (%)",
    "Pre-school attendance - Lombardy (%)",
    "Children (0-6) per family (with children)"
  ),
  Value = c(
    round(total_households/1e6, 2),
    round(weighted_avg, 2),
    round(weighted_avg_lom, 2),
    round(macro_stats$avg_size[macro_stats$macro_area == "North"], 2),
    round(macro_stats$avg_size[macro_stats$macro_area == "South"], 2),
    round(emp_rate_italy, 2),
    round(emp_rate_lom, 2),
    round(mean(emp_by_rip$rate[emp_by_rip$rip %in% c(1,2)]), 2),
    round(mean(emp_by_rip$rate[emp_by_rip$rip %in% c(4,5)]), 2),
    round(mothers_share_italy, 2),
    round(mothers_by_rip$share[mothers_by_rip$rip == 4], 2),
    round(attendance_rate, 2),
    round(lom_attendance, 2),
    round(weighted_avg_children, 2)
  )
)

kable(summary_data, caption = "Key Indicators Summary (Weighted)")
```

---

*Report generated: `r Sys.Date()`*

*All analyses use proper population weights (coefin/10,000)*


# Points of Discussion

1. The AVQ survey doesn't have any questions regarding religious stance or conservativeness. I was thinking that maybe this can be proxied with Euroskepticism?

  a. There is a question asking about 'how much do you believe in: European Parliament?'
  
  b. There are also several questions asking the perception of several disorders on their community, such as:
  
  - How safe do you feel walking on the street when it's dark and you're alone in the area where you live?
  
  - In the area where you live, how often do you see: people using drugs
  
  - In the area where you live, how often do you see: people dealing drugs
  
  - In the area where you live, how often do you see: prostitutes looking for clients
  
  - In the area where you live, how often do you see: acts of vandalism against public property (broken phone booths, trash bins, etc.)
  
  - In the area where you live, how often do you see: vagrants, people with no fixed address  - 