# Study Title: 	
# Date: 		
# Name: 

# self-deposit data_checks-template version: 1.0
# This template contains commands that have proved useful for data checks. This by no 
# means implies that all necessary checks are part of this R script.

# all elements enclosed by "#" are placeholders and have to be replaced to make the code run
# all "#" in the code have to be deleted
 
# This script is licensed under a Creative Commons Attribution 4.0 International License
# (CC BY 4.0, https://creativecommons.org/licenses/by/4.0/)
# Suggested citation: 
## Hirsch, Lisa (2025). Template for Data Checks. R-Script. Vienna: The Austrian Social Science Data Archive.

################################################################################
### load any packages you need ###

## run these lines if this is the first time you run this script
#install.packages("haven")
#install.packages("labelled")
#install.packages("tidyverse")
#install.packages("readr")
#install.packages("foreign")

library(haven)
library(labelled)
library(tidyverse)
library(readr)
library(foreign)
  
################################################################################
### define global variables ###

# directory that holds the data; if you paste a copied path, be aware that R
# needs forward slashes!
datadir <- "#C:/Users/MyName/Myproject/Mydata#"
# file name of the original dataset
dataname <- "#data_4_publication_v2_wide.extension#"

################################################################################
### Import data ###
# Select the option that fits your data!

## Import R file (saved with save(), e.g. .RData / .rda)
setwd(datadir) # change to the data directory
# load() restores the saved objects into the workspace and returns only their
# NAMES (a character vector), so the dataset has to be fetched by name.
loaded_objects <- load(dataname)
# uses the first object stored in the file; adjust the index if the file
# contains several objects and the dataset is not the first one
data <- get(loaded_objects[1])

## Import csv file
# this will not be labelled
setwd(datadir) # change to the data directory
data <- read_csv(dataname) # open data file

## Import STATA file
setwd(datadir) # change to the data directory
data <- read_dta(dataname) # open data file

################################################################################
				### Data & Documentation Checks ###
################################################################################

## size of the dataset
dim(data) # number of observations (n) and number of variables (k)

## check ID variable
# every observation must be uniquely identified by its ID
colnames(data) # list the variable names to locate the ID variable
# compares the number of distinct ID values with the number of rows:
# TRUE means the ID variable uniquely identifies all observations
identical(
  nrow(distinct(data, #id-var#)), # replace "id-var" with the varname of the ID var
  nrow(data)
)

## check for duplicates
# if duplicated rows exist, verify whether they are legitimate or a mistake
sum(duplicated(data)) # number of duplicated rows; 0 means there are none
  
## look for unlabelled variables
# every variable should carry a variable label.
# missing labels can be added with var_label(data$var) <- "XX"
data %>%
  var_label() # list of all variable labels; NULL marks a missing label

data %>%
  var_label() %>%
  mapply(FUN = is.null, SIMPLIFY = TRUE) %>% # TRUE where the label is NULL
  which() # the variables that have no label

data %>%
  look_for() # shows all variables with their labels and value labels

## tabulate all numeric variables
# compare the output with both the questionnaire and the codebook -
# the information must match!
# watch out for typos and spelling mistakes
# pay attention to variables holding sociodemographic information
# categories with fewer than 20 observations carry a re-identification risk!
data %>%
  select(where(is.numeric)) %>% # keep the numeric columns only
  lapply(table) # one frequency table per column

## tabulate all string variables
# compare the output with the questionnaire and the codebook -
# the information should match!
# watch out for typos and spelling mistakes
# pay attention to variables holding sociodemographic information
# categories with fewer than 20 observations carry a re-identification risk!
data %>%
  select(where(is.character)) %>% # keep the character columns only
  lapply(table) # one frequency table per column


################################################################################
### Anonymization checks ###
###############################################################################
  
## inspect all string variables for potentially identifying information
data %>%
  select(where(is.character)) %>% # keep the character columns only
  lapply(table) # one frequency table per column

## room for anonymisation

################################################################################
### Plausibility checks ###
################################################################################

## here, you can perform plausibility checks by crosstabbing variables


################################################################################
### doi, version, saving ###
################################################################################
  
## add VERSION / DOI as the first two variables of the dataset
# version: add year, month, day; always use small letters in the variable name
# doi: add the doi
data <- data %>%
  mutate(
    version = "#1.0 (202#Y#-#MM#-#DD#)#",
    doi = "doi:10.11587/######"
  ) %>%
  set_variable_labels(
    version = "AUSSDA archive version",
    doi = "digital object identifier"
  ) %>%
  relocate(version, doi) # move both variables to the front, keep the rest in order

## save the dataset under the standard filename pattern "DOIsuffix_da_language_version.ext"
# if the language of your filename is German, replace "en" with "de"
# replace ###### with the DOI suffix
setwd(datadir)
# saveRDS() has no row.names argument (passing one raises an "unused argument"
# error). The ".rds" extension keeps this file separate from the csv export,
# which would otherwise be written to the very same file name.
saveRDS(data, "######_da_en_v1_0.rds")

## save the dataset as csv, too, and upload it as well
# if the language of your filename is German, replace "en" with "de"
# replace ###### with the DOI suffix
setwd(datadir)
write.csv(data, "######_da_en_v1_0.csv", row.names = TRUE)