I always use gtf file and retrieve gene information. There isn’t a highly flexible tool to solve my demand. I modified the code from “https://github.com/Jverma/GFF-Parser”, thanks Jverma. This tool will be easier to use.
Usage
Basically, there are three parameters.
id: either transcript id or gene id.
attType: attribute defined in gtf file. E.g. feature (column 3), gene_name, exon_number, transcript_id in column 9
attValue: the attribute value you want to search for.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
>>> import sys >>> from gtfParser import gtfParser
>>> gtf = gtfParser("example.gtf")
>>> # Get all exons in CDK7 >>> gtf.getRecordsByID("CDK7", "feature", "exon")
>>> # Get all features of transcript_id defined as "NM_001324069" in gene "CDK7" >>> gtf.getRecordsByID("CDK7", "transcript_id", "NM_001324069")
>>> # Get start codon where feature was defined as "start_codon" in transcript "NM_001324069" >>> gtf.getRecordsByID("NM_001324069", "feature", "start_codon")
>>> # Get a exon where its id is "NM_001324078.1" in "NM_001324078" transcript >>> gtf.getRecordsByID("NM_001324078", "exon_id", "NM_001324078.1")
Example gtf
Here is an simple example of gtf file. You can use to test. A subset from refSeq.hg38.gtf.
for line inopen(input_file): if line.startswith("#"): continue record = line.strip().split("\t") sequence_name = record[0] source = record[1] feature = record[2] start = int(record[3]) end = int(record[4]) if (record[5] != '.'): score = record[5] else: score = None strand = record[6] if (record[7] != '.'): frame = record[7] else: frame = None attributes = record[8].split(';') # please note the separator between each attribute attributes = [x.strip() for x in attributes[0:-1]] # ZZX attributes = {x.split(' ')[0]: x.split(' ')[1].strip("\"") for x in attributes if" "in x} # ZZX
ifnot (sequence_name in self.data): self.data[sequence_name] = [] alpha = {'source': source, 'feature': feature, 'start': start, 'end': end, 'score': score, 'strand': strand, 'frame': frame} # python 3 .items(), python 2 .iteritems() ZZX for k, v in attributes.items(): alpha[k] = v self.data[sequence_name].append(alpha)
# ZZX for k, v in self.data.items(): for alpha in v: gene_id = alpha["gene_id"] transcript_id = alpha["transcript_id"]
self.transcriptID_geneID_dict[transcript_id] = gene_id if gene_id in self.gene_attributes_dict.keys(): self.gene_attributes_dict[gene_id].append(alpha) else: self.gene_attributes_dict[gene_id] = list() self.gene_attributes_dict[gene_id].append(alpha)
defgetRecordsByID(self, id, attType, attValue): """ Gets all the features for a given gene :param id: Identifier of the gene (specified by gene_id) or mRNA (specified by transcript_id) :param attType: Any attribute list in gtf file. :param attValue: :return: A list of features related to ID where restricted by attType and attValue. """
att_list = [] ifidin self.gene_attributes_dict.keys(): for x in self.gene_attributes_dict[id]: if ( attType in x.keys() and x[attType] == attValue ): att_info = x att_list.append(att_info) elifidin self.transcriptID_geneID_dict.keys(): for x in self.gene_attributes_dict[self.transcriptID_geneID_dict[id]]: if ( attType in x.keys() and x[attType] == attValue and x["transcript_id"] == id): att_info = x att_list.append(att_info) else: sys.stderr.write("Could not find ID "+id+'\n') sys.exit(1) return att_list