SQL Queries With NHANES Data

Skills

Task

Complete the following questions using SQL.

Data

NHANES data containing demographic information for respondents can be found here. NHANES data containing additional information on respondents, such as triglyceride and cholesterol levels, can be found here. Relevant information on the variables is included below:

Gender - Gender of respondents

1 - Male
2 - Female

Age_in_years_at_screening = Age at screening in years (continuous up to 79)

0-79 - Any age between 0 and 79
80 - 80 or older

Race_Hispanic_origin_w_NH_Asian = Race/Hispanic origin

1 - Mexican American
2 - Other Hispanic
3 - Non-Hispanic White
4 - Non-Hispanic Black
6 - Non-Hispanic Asian
7 - Other race, including multi-racial
NA - Missing

Marital_status = Marital status

1 - Married
2 - Widowed
3 - Divorced
4 - Separated
5 - Never married
6 - Living with partner
77 - Refused
99 - Don’t know
NA - Missing

Annual_household_income = Annual household income

1 - $0 to $4,999
2 - $5,000 to $9,999
3 - $10,000 to $14,999
4 - $15,000 to $19,999
5 - $20,000 to $24,999
6 - $25,000 to $34,999
7 - $35,000 to $44,999
8 - $45,000 to $54,999
9 - $55,000 to $64,999
10 - $65,000 to $74,999
11 - $20,000 and over
12 - Under $20,000
14 - $75,000 to $99,999
15 - $100,000 and over
77 - Refused
99 - Don’t know
NA - Missing

Pregnancy_status_at_exam = Pregnancy status at exam

1 - Yes
2 - No
3 - Cannot ascertain
NA - Missing

Education_level_Adults = Eucation level

1 - Less than 9th grade
2 - 9-11th grade (Includes 12th grade with no diploma)
3 - High school graduate/GED or equivalent
4 - Some college or AA degree
5 - College graduate or above
7 - Refused
9 - Don’t know
NA - Missing

I first import and merge the datasets before running any queries.

demo <- read.csv('data/NHANES_Demographics.csv')
tri <- read.csv('data/NHANES_Triglycerides.csv')

merged <- sqldf(
"SELECT *
FROM demo AS d
LEFT JOIN tri AS t
ON d.Respondent_sequence_number = t.Respondent_sequence_number"
)

knitr::kable(head(merged[1:5]))

Respondent_sequence_number	Gender	Age_in_years_at_screening	Race_Hispanic_origin_w_NH_Asian	Education_level_Adults
62161	1	22	3	3
62162	2	3	1	NA
62163	1	14	6	NA
62164	2	44	3	4
62165	2	14	4	NA
62166	1	9	3	NA

knitr::kable(head(merged[6:10]))

Marital_status	Pregnancy_status_at_exam	Annual_household_income	Annual_family_income	Respondent_sequence_number..10
5	NA	14	14	62161
NA	NA	4	4	NA
NA	NA	15	15	NA
1	2	8	8	62164
NA	NA	4	4	62165
NA	NA	77	77	NA

knitr::kable(head(merged[11:14]))

Triglyceride_mg_dL	Triglyceride_mmol_L	LDL_cholesterol_mg_dL	LDL_cholesterol_mmol_L
84	0.948	110	2.845
NA	NA	NA	NA
NA	NA	NA	NA
56	0.632	151	3.905
71	0.802	84	2.172
NA	NA	NA	NA

I rename the variables in the merged dataset and remove the duplicate ID column.

full_data <- sqldf(
"SELECT Respondent_sequence_number      AS id, 
        Gender                          AS gender,
        Age_in_years_at_screening       AS age, 
        Race_Hispanic_origin_w_NH_Asian AS race,
        Education_level_Adults          AS educ, 
        Marital_status                  AS marital_status,
        Pregnancy_status_at_exam        AS pregnancy_status,      
        Annual_household_income         AS household_income,
        Annual_family_income            AS family_income,
        Triglyceride_mg_dL              AS tri_mg_dl,
        Triglyceride_mmol_L             AS tri_mmol_l,
        LDL_cholesterol_mg_dL           AS chol_mg_ldl,
        LDL_cholesterol_mmol_L          AS chol_mmol_l
FROM merged"
)

Queries

Q1

Write a query that gives the frequency of each race, as well as the average age of participants by race.

race_freq <- sqldf(
"SELECT race, COUNT(race) AS freq, ROUND(AVG(age),1) AS mean_age
    FROM full_data 
GROUP BY race"
)

knitr::kable(head(race_freq))

race	freq	mean_age
1	1355	22.5
2	1076	30.4
3	2973	37.9
4	2683	30.5
6	1282	31.5
7	387	21.8

Q2

Show the distribution of race by gender and display all unique race-gender combinations from highest to lowest frequency.

race_freq2 <- sqldf(
"SELECT gender, race, COUNT(race) AS freq
    FROM full_data 
GROUP BY race, gender 
ORDER BY freq DESC"
)

knitr::kable(head(race_freq2))

gender	race	freq
1	3	1508
2	3	1465
2	4	1372
1	4	1311
1	1	694
2	1	661

Q3

How many women were pregnant at the time of screening?

preg_freq <- sqldf(
"SELECT COUNT(pregnancy_status) AS number_pregnant
    FROM full_data 
GROUP BY pregnancy_status
HAVING pregnancy_status = 1"
)

preg_freq

##   number_pregnant
## 1              57

Q4

How many men refused to provide their annual household income?

refuse_income <- sqldf(
"SELECT gender, COUNT(household_income) AS num_refused
    FROM full_data 
WHERE gender = 1 and household_income = 77")

knitr::kable(head(refuse_income))

gender	num_refused
1	134

Q5

Using the two separate datasets, find the mean LDL level (mg/dL) for each gender.

mean_chol <- sqldf(
"SELECT gender, ROUND(AVG(LDL_cholesterol_mg_dL), 1) AS mean_chol
    FROM demo AS d
      INNER JOIN tri AS t
      ON d.Respondent_sequence_number = t.Respondent_sequence_number
GROUP BY Gender")

mean_chol

##   Gender mean_chol
## 1      1     107.8
## 2      2     111.1

Q6

Using the two separate datasets, find the minimum and maximum triglyceride levels (mmol/L) for each race.

minmax <- sqldf(
"SELECT Race_Hispanic_origin_w_NH_Asian AS race, MIN(Triglyceride_mmol_L) AS min, MAX(Triglyceride_mmol_L) AS max
    FROM demo AS d
      INNER JOIN tri AS t
      ON d.Respondent_sequence_number = t.Respondent_sequence_number
GROUP BY race")

knitr::kable(head(minmax))

race	min	max
1	0.339	11.053
2	0.339	7.768
3	0.361	17.635
4	0.203	6.040
6	0.237	6.593
7	0.294	4.301

Q7

How many participants had an annual household income of $55,000 or more? Break this down by gender and order the results in descending frequency.

high_income <- sqldf(
"SELECT gender, COUNT(household_income) AS freq
    FROM full_data 
WHERE household_income in (9, 10, 14, 15)
GROUP BY gender 
ORDER BY freq DESC"
)

knitr::kable(head(high_income))

gender	freq
1	1588
2	1541