The qc command performs read-level quality control at
the BAM level and generates the input tables required for downstream
visualization with qc_visualization().
This step is implemented entirely in bash and is designed to be fast, reproducible, and independent of R. It computes sequencing depth for each CRF-CRF pair and applies a percentile-based filtering strategy to remove low-coverage pairs prior to visualization or modeling.
What this script does:
samtools idxstats to compute total mapped read
counts for each BAM file.all_qc.tsv) with
columns:
pair: CRF-CRF pair name (derived from BAM
filename)read_count: total mapped readsfiltered_qc.tsv.| Argument | Type | Default | Description | Example |
|---|---|---|---|---|
-i, –input
|
character | — | Directory containing input BAM files (one per CRF pair) |
-i ./bam
|
-o, –output
|
character | — | Output directory for QC tables |
-o ./qc
|
-p, –percentile
|
numeric |
0.25
|
Percentile threshold used to filter low-read-count CRF pairs |
-p 0.1
|
-e, –exclude
|
character |
unknown,IgG_control
|
Comma-separated list of CRF names to exclude. Exclusion is performed
before read counting.
|
-e unknown
|
The command generates the following output files in the specified
out_dir:
all_qc.tsv
| pair | read_count | |
|---|---|---|
| <chr> | <int> | |
| 1 | H3K27me3-H3K4me1 | 2207658 |
| 2 | H3K4me3-H3K4me3 | 1863056 |
| 3 | H3K27me3-H3K9me3 | 1763720 |
| 4 | H3K9me3-H3K9me3 | 1593466 |
| 5 | H3K27me3-H3K4me3 | 1139048 |
| 6 | H3K4me1-H3K4me3 | 856052 |
| 7 | H3K4me3-H3K9me3 | 821540 |
| 8 | H3K4me1-H3K9me3 | 603640 |
| 9 | H3K4me1-H3K4me1 | 567902 |
| 10 | H3K9me2-H3K9me3 | 394360 |
| 11 | H3K27me3-H3K9me2 | 373104 |
| 12 | H3K9me2-H3K9me2 | 222676 |
| 13 | H3K4me3-H3K9me2 | 159988 |
| 14 | H3K27ac-H3K4me3 | 120056 |
| 15 | H3K27ac-H3K27me3 | 85816 |
filtered_qc.tsv
filtered_percentile parameter| pair | read_count | |
|---|---|---|
| <chr> | <int> | |
| 1 | H3K27me3-H3K4me1 | 2207658 |
| 2 | H3K4me3-H3K4me3 | 1863056 |
| 3 | H3K27me3-H3K9me3 | 1763720 |
| 4 | H3K9me3-H3K9me3 | 1593466 |
| 5 | H3K27me3-H3K4me3 | 1139048 |
| 6 | H3K4me1-H3K4me3 | 856052 |
| 7 | H3K4me3-H3K9me3 | 821540 |
| 8 | H3K4me1-H3K9me3 | 603640 |
| 9 | H3K4me1-H3K4me1 | 567902 |
| 10 | H3K9me2-H3K9me3 | 394360 |
| 11 | H3K27me3-H3K9me2 | 373104 |
# Test Data
BAM_DIR="./bam"
QC_DIR="./qc"
for d in "$BAM_DIR"/*/; do
[[ -d "$d" ]] || continue
sample="$(basename "$d")"
echo "======================"
echo "$sample"
echo "======================"
out="${QC_DIR}/${sample}"
multiEpiPrep qc -i "$d" -o "$out" -e unknown,IgG_control
echo ""
done
# Move unpassed files to unpassed/
for d in "$BAM_DIR"/*/; do
[[ -d "$d" ]] || continue
sample="$(basename "$d")"
tsv="${QC_DIR}/${sample}/filtered_read_count.tsv"
echo "======================"
echo "$sample"
echo "======================"
# create unpassed dir
unpassed_dir="${d}/unpassed"
mkdir -p "$unpassed_dir"
pairs=$(awk -F'\t' '
NR==1 {
for (i=1; i<=NF; i++) if ($i=="pair") col=i
if (!col) { print "ERROR: pair column not found" > "/dev/stderr"; exit 1 }
next
}
$col != "" { print $col }
' "$tsv")
for bam in "$d"/*.bam; do
[[ -e "$bam" ]] || continue
pair="$(basename "$bam" .bam)"
if ! printf '%s\n' "$pairs" | grep -Fxq -- "$pair"; then
echo "Moving $bam -> $unpassed_dir"
mv "$bam" "$unpassed_dir/"
# move index if exists
[[ -e "${bam}.bai" ]] && mv "${bam}.bai" "$unpassed_dir/"
fi
done
done