Double Checking FASTQs - Daniel E. Cook

When you have performed a sequencing project, quality control is one of the first things you will need to do. Unfortunately, sample mix-ups and other issues can and do happen. Systematic biases can also occur by machine and lane.

This script will extracting basic information from a set of FASTQs and output it to summary file (fastq_summary.txt). This will work with demultiplexed FASTQs generated by Illumina machines that appear in the following format:

@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1

@HWI-EAS209_0006_FC706VJ – Machine name
5 – lane
58 – tile within flowcell lane
5894 – x coordinate of cluster within tile
21141 – y coordinate of cluster within tile
#ATCACG – index
/1 – member of pair (/1 or /2)

The script below will extract the machine name, lane, index, and pair.

#!/usr/bin/python
import re
from itertools import groupby as g
import subprocess
import sys
from collections import OrderedDict

def most_common(L):
  return max(g(sorted(L)), key=lambda(x, v):(len(list(v)),-L.index(x)))[0]

# Set this variable to ensure no quality score lines get examined.
fq_at_start = "HWI"

r = subprocess.check_output("""
for r in `ls *fastq.gz`; 
do
echo "$r" 
gunzip -cq $r | head -n 1000 | grep '^@%s' - | grep -v '^@@' |  egrep '(:.+){4}' -
echo "|" 
done;
""" % fq_at_start, shell=True)


f = open("fastq_summary.txt", "w")
 

orig_line =  OrderedDict([("file", ""),
      ("instrument", ""),
      ("flowcell_id", ""),
      ("flowcell_lane", ""),
      ("x_coord", ""),
      ("y_coord", ""),
      ("index", ""),
      ("pair", ""),
      ("run_id", ""),
      ("filtered",""),
      ("control_bits","")])
l_keys = orig_line.keys()

f.write('\t'.join(l_keys) + "\n")
 
for fq_group in [filter(len,x.split('\n')) for x in r.split("|")][:-1]:
  index_set = []
  for heading in fq_group[1:]:
    l = re.split(r'(\:|#| )',heading)
    line = orig_line
    line["file"] = fq_group[0]
                if l[0].startswith("@SRR"):
                    line["run_id"] = l[0]
                    line["instrument"] = l[2]
                    line["flowcell_lane"] = l[4]
                    index_set.append("")
    elif len(l) == 11:
      line["instrument"] = l[0]
      line["flowcell_lane"] = l[2]
      line["flowcell_tile"] = l[4]
      line["x_coord"] = l[6]
      line["y_coord"] = l[8]
      try:
        line["pair"] = l[10].split("/")[1]
        index_set.append(l[10].split("/")[0])
      except:
        break
    elif len(l) == 21:
      line["instrument"] = l[0]
      line["run_id"] = l[2]
      line["flowcell_id"] = l[4]
      line["flowcell_lane"] = l[6]
      line["flowcell_tile"] = l[8]
      line["x_coord"] = l[10]
      line["y_coord"] = l[12]
      line["pair"] = l[14]
      line["filtered"] = l[16]
      line["control_bits"] = l[16]
      line["index"] = l[20]
      index_set.append(l[20])
    else:
      print "error", l
  line["index"] = most_common(index_set)
  f.write('\t'.join([line[x] for x in l_keys]+ ["\n"]))