# normalised-extract-exon-guides.py

'''
A Benchmark of Computational CRISPR-Cas9 Guide Design Methods

Jacob Bradford, Dimitri Perrin. 2019.

This script reads the normalised data (generated by normalise.py) and extracts
guides that target exons. 

Notes:
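    - Output is written to "OUTPUT_DIR"/<size>/<file name> (for example
        exon-only/500k/), relative to the working directory. Make sure a
        directory named after every key of "scannableDirs" exists inside
        "OUTPUT_DIR" before running.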

Run:
    1:  (normalise the raw data first, using: normalise.py)
    2:  normalised-extract-exon-guides.py
    
Output:
    - Guides which target exon regions, in the normalised format

'''

import argparse, os, re

# this script requires the datasets generated by normalise.py 
# it extracts all guides that are within the exon regions, as per the rules from mm10db
# ie: within the bounds of the exon region, plus-minus 17 bases

# Dataset size label -> exon annotation file that createExonHashTable() loads
# for that size. The 'full' entry is disabled, so no exon list is available
# for the 'full' label.
mm10dbExonListFiles = {
    '500k' :        r"exon_list_500k.txt",
    '1m' :          r"exon_list_1m.txt",
    '5m' :          r"exon_list_5m.txt",
    #'full' :       r"exon_list_full.txt",
}

# Dataset size label -> the same size as an integer (presumably a sequence
# length in bases - TODO confirm). Not referenced by any other code visible in
# this script.
humanToIntSizes = {
    '500k' : 500000,
    '1m' : 1000000,
    '5m' : 5000000,
    'full' : 61431566,
}

# Directory under which the filtered files are written; each dataset size gets
# its own sub-directory named after the size label.
OUTPUT_DIR = 'exon-only'

# Names of the tools whose normalised output files should be processed.
# A file is selected when its name, with hyphens removed and lower-cased,
# contains one of these names (see doesFileNameMatchToolsToConsider).
toolsToConsider = [
    'casdesigner',
    'casfinder',
    'cctop',
    'chopchop',
    'crispor',
    'crisprdo',
    'crisprera',
    'ctfinder',
    'flashfry',
    'gtscan',
    'guidescan',
    'mm10db',
    'phytocrispex',
    'sgrnacas9',
    'sgrnascorer',
    'ssc',
    'wucrispr',
    'tuscan'
]

# Dataset size label -> directory that is scanned for the tool output files of
# that size. Keys are dataset size labels, as in mm10dbExonListFiles.
scannableDirs = {
    '500k'  :   r'normalised/500k',
    '1m'    :   r'normalised/1m',
    '5m'    :   r'normalised/5m',
    'full'  :   r'normalised/full',
}

def doesFileNameMatchToolsToConsider(fileName):
    """Return True when fileName belongs to one of the tools in toolsToConsider.

    The file name is compared with hyphens removed and in lower case, so a
    name such as "sgRNA-cas9.csv" is matched by the tool name "sgrnacas9".
    Matching is by substring: the tool name may appear anywhere in the name.
    """
    # Normalise once instead of once per candidate tool.
    normalisedName = fileName.replace('-', '').lower()
    return any(tool.lower() in normalisedName for tool in toolsToConsider)
 

def createExonHashTable(size):
    """Load the exon regions for one dataset size, grouped by strand.

    The annotation file is looked up in mm10dbExonListFiles. It is read as
    tab-separated text where, counting from zero, column 2 holds the strand
    ('+' or '-'), column 3 the region start and column 4 the region end.

    Args:
        size: dataset size label; must be a key of mm10dbExonListFiles.

    Returns:
        A dict {'+': [[start, end], ...], '-': [[start, end], ...]} with the
        bounds as ints, in file order.

    Raises:
        KeyError: if size has no annotation file, or a record has a strand
            other than '+' or '-'.
        IOError/OSError: if the annotation file cannot be opened.
        IndexError, ValueError: if a non-blank record is malformed.
    """
    annotationFile = mm10dbExonListFiles[size]
    hashTable = {'+' : [], '-' : []}
    with open(annotationFile, 'r') as fOpen:
        # Iterate line by line rather than dropping the last element of a
        # split('\n'): that silently lost the final record whenever the file
        # did not end with a newline.
        for rawLine in fOpen:
            line = rawLine.rstrip('\r\n')
            if not line.strip():
                # tolerate blank lines (e.g. trailing empty lines)
                continue
            lineSplit = line.split('\t')
            strand = lineSplit[2]
            exonRegionStart = int(lineSplit[3])
            exonRegionEnd = int(lineSplit[4])
            hashTable[strand].append([exonRegionStart, exonRegionEnd])

    return hashTable
    
def isGuideWithinExon(strStrand, guidePos, exonHashTable):
    """Return True when guidePos lies within any exon region, +/- 17 bases.

    Args:
        strStrand: strand of the guide. Accepted but not used: the regions of
            every strand in exonHashTable are checked.
        guidePos: position of the guide to test.
        exonHashTable: dict mapping a strand to a list of [start, end] regions.

    Returns:
        True if exon_start - 17 <= guidePos <= exon_end + 17 for at least one
        region (bounds inclusive), otherwise False.
    """
    flank = 17
    return any(
        region[0] - flank <= guidePos <= region[1] + flank
        for regions in exonHashTable.values()
        for region in regions
    )


# Main driver: for every dataset size, copy the lines of each matching tool
# file whose guide falls inside an exon region to OUTPUT_DIR/<size>/<file>.
for scannableDir in scannableDirs:
    # scannableDirs lists 'full', but its exon list is commented out in
    # mm10dbExonListFiles. Skip sizes without an exon list instead of dying
    # with a KeyError (possibly before any other size has been processed).
    if scannableDir not in mm10dbExonListFiles:
        print('\nSkipping: %s (no exon list configured)' % scannableDir)
        continue

    exonHashTable = createExonHashTable(scannableDir)
    # Parenthesised single-argument form works on both Python 2 and 3.
    print('\nProcessing: %s' % scannableDir)

    # Create the output directory on demand so a missing directory does not
    # abort the run half way through.
    outputDir = os.path.join(OUTPUT_DIR, scannableDir)
    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    # walk over the directory
    for (dirpath, _dirnames, filenames) in os.walk(scannableDirs[scannableDir]):

        # for file within the directory
        for currentFilename in filenames:
            if not doesFileNameMatchToolsToConsider(currentFilename):
                continue

            # os.walk also yields files from sub-directories, so the path has
            # to be built from dirpath rather than from the top directory.
            # NOTE: output is flat; equally named files from different
            # sub-directories end up in the same output file.
            currentFullpath = os.path.join(dirpath, currentFilename)
            newFullpath = os.path.join(outputDir, currentFilename)

            with open(currentFullpath, 'r') as fCurrentRead:
                with open(newFullpath, 'w+') as fWrite:
                    for currentLineRaw in fCurrentRead:
                        # Strip '\r' as well: with CRLF input the strand
                        # field would otherwise never compare equal to '+'.
                        currentLineStripped = currentLineRaw.rstrip('\r\n')
                        if not currentLineStripped.strip():
                            # ignore blank lines
                            continue

                        # Fields used here: index 2 = start, 3 = end,
                        # 4 = strand.
                        currentLine = currentLineStripped.split(',')
                        currStartPos = int(currentLine[2])
                        currEndPos = int(currentLine[3])
                        currStrand = currentLine[4]

                        # The end position is tested for '+' guides, the
                        # start position for everything else.
                        if currStrand == '+':
                            hashKey = currEndPos
                        else:
                            hashKey = currStartPos

                        if isGuideWithinExon(currStrand, hashKey, exonHashTable):
                            # write the original line unchanged
                            fWrite.write('%s' % (currentLineRaw))