# Study Title:
# Date:
# Name:
# self-deposit data_checks-template version: 1.0

# This template contains commands that proved useful for data checks. This by no
# means indicates that all necessary checks are part of this script.
# all elements enclosed by "#" are placeholders and have to be replaced to make the code run
# all "#" in the code have to be deleted

# This script is licensed under a Creative Commons Attribution 4.0 International License
# (CC BY 4.0, https://creativecommons.org/licenses/by/4.0/)
# Suggested citation:
## Hirsch, Lisa (2025). Template for Data Checks. R-Script. Vienna: The Austrian Social Science Data Archive.

################################################################################
### load any packages you need ###

## run these lines if this is the first time you run this script
#install.packages("haven")
#install.packages("labelled")
#install.packages("tidyverse")
#install.packages("readr")
#install.packages("foreign")

library(haven)     # read_dta()
library(labelled)  # var_label(), look_for()
library(tidyverse) # %>%, select(), distinct(), ...
library(readr)     # read_csv()
library(foreign)

################################################################################
### define global variables ###

dataname <- "#data_4_publication_v2_wide.extension#" # name of the original dataset
datadir <- "#C:/Users/MyName/Myproject/Mydata#" # directory of the data. if you copy the path, be aware that R needs forward slashes!

################################################################################
### Import data ###
# Select the option that fits your data!
## Import R file
# load() restores the saved objects and only returns their NAMES, so the
# dataset itself has to be fetched from the environment it was loaded into.
# A separate environment is used so that loading cannot overwrite objects
# of this script.
setwd(datadir) # change to SIP folder
import_env <- new.env()
loaded_names <- load(dataname, envir = import_env) # open data file
data <- import_env[[loaded_names[1]]] # first object stored in the file

## Import csv file
# this will not be labelled
setwd(datadir) # change to SIP folder
data <- read_csv(dataname) # open data file of SIP folder

## Import STATA file
setwd(datadir) # change to folder
data <- read_dta(dataname) # open data file

################################################################################
### Data & Documentation Checks ###
################################################################################

## descriptive information on the dataset and variables
dim(data) # n obs of k variables

## check ID variable
# an ID must uniquely identify observations
colnames(data) # find id variable
id_var <- "#id-var#" # replace "id-var" with the varname of the ID var
identical(
  data %>%
    distinct(across(all_of(id_var))) %>%
    nrow(),
  nrow(data)
) # returns true if the id variable uniquely identifies all obs

## check for duplicates
# check whether duplicates are indeed correct or a mistake
data %>%
  duplicated() %>% # TRUE if duplicated
  sum() # should be 0 if there are no duplicates

## look for unlabeled variables
# all variables should be labelled.
# if there are unlabelled variables, label them
# using var_label(data$var) <- "XX"
var_label(data) # gives a list of all variable labels - shows null if it does not exist
which(vapply(var_label(data), is.null, logical(1))) # shows a subset of variables without labels
look_for(data) # shows all variables and labels and value labels as well

## check all numeric variables and their labels
# cross-check with both the questionnaire and the codebook
# the information must be the same!
# please also check for typos and spelling mistakes
# check for any variables that contain sociodemographic information
# check if there are fewer than 20 observations per category (re-identification risk!)
lapply(
  data %>% select(where(is.numeric)),
  FUN = table
)

## check all string variables
# cross-check this with the questionnaire and the codebook.
# the information should be the same!
# please also check for typos and spelling mistakes
# check for any variables that contain sociodemographic information
# check if there are fewer than 20 observations per category (re-identification risk!)
lapply(
  data %>% select(where(is.character)),
  FUN = table
)

################################################################################
### Anonymization checks ###
################################################################################

## check all string variables for potentially identifying information
lapply(
  data %>% select(where(is.character)),
  FUN = table
)

## room for anonymisation

################################################################################
### Plausibility checks ###
################################################################################

## here, you can perform plausibility checks by crosstabbing variables

################################################################################
### doi, version, saving ###
################################################################################

## add DOI / VERSION to the beginning of each dataset
data$version <- "#1.0 (202#Y#-#MM#-#DD#)#" # add year, month, day; always use small letter in variable name
var_label(data$version) <- "AUSSDA archive version"
data$doi <- "doi:10.11587/######" # add doi
var_label(data$doi) <- "digital object identifier"
data <- data %>%
  select(version, doi, everything()) # move version and doi to the front

## save the dataset under the standard filename pattern "DOIsuffix_da_language_version.ext"
# if the language of your filename is German, replace "en" with "de"
# saveRDS() has no row.names argument; the file needs its own extension so
# that the csv export below does not overwrite it
setwd(datadir)
saveRDS(data, "######_da_en_v1_0.rds") # replace ###### with DOI suffix

## Save this as csv and upload it as well
# if the language of your filename is German, replace "en" with "de"
# row.names = TRUE writes the row names as an additional first column
setwd(datadir)
write.csv(data, "######_da_en_v1_0.csv", row.names = TRUE) # replace ###### with DOI suffix