Return Column and Row Names of Samples and Probes under the Missingness Threshold

MarkMissing(dnaM_df, sampMissing_p = 0.5, probeMissing_p = 0.25)

Arguments

dnaM_df: A data frame of DNA methylation values. Samples are columns. Row names are probe IDs.
sampMissing_p: The maximum proportion of missingness allowed in a sample. Defaults to 50%.
probeMissing_p: The maximum proportion of missingness allowed in a probe. Defaults to 25%.

Value

A list of four entries:

dropSamples: the column names of samples whose proportion of missing values is greater than sampMissing_p
keepSamples: the column names of samples whose proportion of missing values is less than or equal to sampMissing_p
dropProbes: the row names of probes whose proportion of missing values is greater than probeMissing_p
keepProbes: the row names of probes whose proportion of missing values is less than or equal to probeMissing_p

Details

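Probes with missingness greater than probeMissing_p are dropped first. The missing proportion of each sample is then calculated using only the remaining probes, so a sample's missingness can differ from its missingness over the full data frame.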

Examples


  ###  Setup  ###
  values_num <- c(
    0.1, 0.1, 0.1, 0.1, 0.1,
    0.1, 0.1, 0.1, 0.1,  NA,
    0.1, 0.1, 0.1, 0.1,  NA,
    0.1, 0.1, 0.1,  NA,  NA,
    0.1, 0.1, 0.1,  NA,  NA,
    0.1, 0.1,  NA,  NA,  NA,
    0.1, 0.1,  NA,  NA,  NA,
    0.1,  NA,  NA,  NA,  NA,
     NA,  NA,  NA,  NA,  NA
  )
  values_mat <- matrix(values_num, nrow = 9, ncol = 5, byrow = TRUE)
  rownames(values_mat) <- paste0("probe_0", 1:9)
  colnames(values_mat) <- paste0("sample_0", 1:5)
  values_df <- as.data.frame(values_mat)
  
  
  ###  Simple Calculations  ###
  MarkMissing(values_df)
#> $dropSamples
#> [1] "sample_05"
#> 
#> $keepSamples
#> [1] "sample_01" "sample_02" "sample_03" "sample_04"
#> 
#> $dropProbes
#> [1] "probe_04" "probe_05" "probe_06" "probe_07" "probe_08" "probe_09"
#> 
#> $keepProbes
#> [1] "probe_01" "probe_02" "probe_03"
#> 
  MarkMissing(values_df, probeMissing_p = 0.5)
#> $dropSamples
#> [1] "sample_05"
#> 
#> $keepSamples
#> [1] "sample_01" "sample_02" "sample_03" "sample_04"
#> 
#> $dropProbes
#> [1] "probe_06" "probe_07" "probe_08" "probe_09"
#> 
#> $keepProbes
#> [1] "probe_01" "probe_02" "probe_03" "probe_04" "probe_05"
#> 
  MarkMissing(values_df, sampMissing_p = 0.25)
#> $dropSamples
#> [1] "sample_05"
#> 
#> $keepSamples
#> [1] "sample_01" "sample_02" "sample_03" "sample_04"
#> 
#> $dropProbes
#> [1] "probe_04" "probe_05" "probe_06" "probe_07" "probe_08" "probe_09"
#> 
#> $keepProbes
#> [1] "probe_01" "probe_02" "probe_03"
#> 
  
  
  ###  Using the Output  ###
  mark_ls <- MarkMissing(values_df, probeMissing_p = 0.5)
  valuesPurged_df <- values_df[ mark_ls$keepProbes, mark_ls$keepSamples ]
  valuesPurged_df
#>          sample_01 sample_02 sample_03 sample_04
#> probe_01       0.1       0.1       0.1       0.1
#> probe_02       0.1       0.1       0.1       0.1
#> probe_03       0.1       0.1       0.1       0.1
#> probe_04       0.1       0.1       0.1        NA
#> probe_05       0.1       0.1       0.1        NA