说明
python alignmentparser示例是从最受好评的开源项目中提取的实现代码,你可以参考下面示例的使用方式。
编程语言: Python
命名空间/包名称: plastidutilscriptlibargparsers
示例#1文件:
make_wiggle.py项目:
joshuagryphon/plastid
def main(argv=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:py:func:`main` is called directly.
Default: sys.argv[1:] (actually command-line arguments)
"""
ap = AlignmentParser()
bp = BaseParser()
parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter,
parents=[bp.get_parser(),ap.get_parser()])
parser.add_argument("-o","--out",dest="outbase",type=str,required=True,
metavar="FILENAME",
help="Base name for output files")
parser.add_argument("--window_size",default=100000,metavar="N",type=int,
help="Size of nucleotides to fetch at once for export. "+\
"Large values are faster but require more memory "+\
"(Default: 100000)")
track_opts = parser.add_argument_group(title="Browser track options")
track_opts.add_argument("--color",type=str,default=None,
help="An RGB hex string (`'#NNNNNN'`, `N` in `[0-9,A-F]`) specifying \
the track color.")
track_opts.add_argument("-t","--track_name",dest="track_name",type=str,
help="Name to give browser track",
default=None)
track_opts.add_argument("--output_format",choices=("bedgraph","variable_step"),
default="bedgraph",
help="Format of output file (Default: bedgraph)")
args = parser.parse_args(argv)
gnd = ap.get_genome_array_from_args(args,printer=printer)
bp.get_base_ops_from_args(args)
if args.track_name is None:
name = args.outbase
else:
name = args.track_name
if args.color is not None:
fw_color = rc_color = "%s,%s,%s" % tuple(get_rgb255(args.color))
else:
fw_color = rc_color = "0,0,0"
if args.output_format == "bedgraph":
outfn = gnd.to_bedgraph
elif args.output_format == "variable_step":
outfn = gnd.to_variable_step
track_fw = "%s_fw.wig" % args.outbase
track_rc = "%s_rc.wig" % args.outbase
with argsopener(track_fw,args,"w") as fw_out:
printer.write("Writing forward strand track to %s ..." % track_fw)
outfn(fw_out,"%s_fw" % name,"+",window_size=args.window_size,color=fw_color,
printer=printer)
fw_out.close()
with argsopener(track_rc,args,"w") as rc_out:
printer.write("Writing reverse strand track to %s ..." % track_rc)
outfn(rc_out,"%s_rc" % name,"-",window_size=args.window_size,color=rc_color,
printer=printer)
rc_out.close()
printer.write("Done!")
示例#2文件:
counts_in_region.py项目:
joshuagryphon/plastid
def main(argv=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:func:`main` is called directly.
Default: `sys.argv[1:]`. The command-line arguments, if the script is
invoked from the command line
"""
ap = AnnotationParser()
annotation_file_parser = ap.get_parser(conflict_handler="resolve")
al = AlignmentParser(disabled=_DISABLED)
alignment_file_parser = al.get_parser(conflict_handler="resolve")
mp = MaskParser()
mask_file_parser = mp.get_parser()
bp = BaseParser()
base_parser = bp.get_parser()
parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter,
parents=[base_parser,
alignment_file_parser,
annotation_file_parser,
mask_file_parser],
)
parser.add_argument("outfile",type=str,help="Output filename")
args = parser.parse_args(argv)
bp.get_base_ops_from_args(args)
ga = al.get_genome_array_from_args(args,printer=printer)
transcripts = ap.get_transcripts_from_args(args,printer=printer,return_type=SegmentChain)
crossmap = mp.get_genome_hash_from_args(args,printer=printer)
ga_sum = ga.sum()
normconst = 1000.0*1e6 / ga_sum
with argsopener(args.outfile,args,"w") as fout:
fout.write("## total_dataset_counts: %s\n" % ga_sum)
fout.write("region_name\tregion\tcounts\tcounts_per_nucleotide\trpkm\tlength\n")
for n,ivc in enumerate(transcripts):
name = ivc.get_name()
masks = crossmap.get_overlapping_features(ivc)
ivc.add_masks(*itertools.chain.from_iterable((X for X in masks)))
if n % 1000 == 0:
printer.write("Processed %s regions..." % n)
counts = numpy.nansum(ivc.get_masked_counts(ga))
length = ivc.masked_length
rpnt = numpy.nan if length == 0 else float(counts)/length
rpkm = numpy.nan if length == 0 else rpnt * normconst
ltmp = [name,
str(ivc),
"%.8e" % counts,
"%.8e" % rpnt,
"%.8e" % rpkm,
"%d" % length]
fout.write("%s\n" % "\t".join(ltmp))
fout.close()
printer.write("Processed %s regions total." % n)
printer.write("Done.")
示例#3文件:
psite.py项目:
joshuagryphon/plastid
def main(argv=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:py:func:`main` is called directrly.
Default: `sys.argv[1:]`. The command-line arguments, if the script is
invoked from the command line
"""
ap = AlignmentParser(allow_mapping=False,input_choices=["BAM"],
disabled=["normalize","big_genome",])
bp = BaseParser()
alignment_file_parser = ap.get_parser()
base_parser = bp.get_parser()
pp = PlottingParser()
plotting_parser = pp.get_parser()
parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter,
parents=[base_parser,
alignment_file_parser,
plotting_parser])
parser.add_argument("--min_counts",type=int,default=10,metavar="N",
help="Minimum counts required in normalization region "+
"to be included in metagene average (Default: 10)")
parser.add_argument("--normalize_over",type=int,nargs=2,metavar="N",
default=None,
#default=(20,50),
help="Portion of each window against which its individual raw count profile"+
" will be normalized. Specify two integers, in nucleotide"+
" distance from landmark (negative for upstream, positive for downstream. Surround negative numbers with quotes.). (Default: 20 50)")
parser.add_argument("--norm_region",type=int,nargs=2,metavar="N",
default=None,
help="Deprecated. Use ``--normalize_over`` instead. "+
"Formerly, Portion of each window against which its individual raw count profile"+
" will be normalized. Specify two integers, in nucleotide"+
" distance, from 5\' end of window. (Default: 70 100)")
parser.add_argument("--require_upstream",default=False,action="store_true",
help="If supplied, the P-site offset is taken to be the distance "+
"between the largest peak upstream of the start codon and "+
"the start codon itself. Otherwise, the P-site offset is taken "+
"to be the distance between the largest peak in the entire ROI "+
"and the start codon. Ignored if ``--constrain`` is used."
)
parser.add_argument("--constrain",type=int,nargs=2,default=None,metavar="X",
help="Constrain P-site offset to be between specified distance from "+
"start codon. Useful for noisy data. "+
"(Reasonable set: 10 15; default: not constrained)")
parser.add_argument("--aggregate",default=False,action="store_true",
help="Estimate P-site from aggregate reads at each position, instead "+
"of median normalized read density. Noisier, but helpful for "+
"lower-count data or read lengths with few counts. (Default: False)"
),
parser.add_argument("--keep",default=False,action="store_true",
help="Save intermediate count files. Useful for additional computations (Default: False)")
parser.add_argument("--default",type=int,default=13,
help="Default 5\' P-site offset for read lengths that are not present or evaluated in the dataset. Unaffected by ``--constrain`` (Default: 13)")
parser.add_argument("roi_file",type=str,
help="ROI file surrounding start codons, from ``metagene generate`` subprogram")
parser.add_argument("outbase",type=str,help="Basename for output files")
# set manual options
args = parser.parse_args(argv)
bp.get_base_ops_from_args(args)
# set defaults
args.mapping = "fiveprime"
args.offset = 0
args.nibble = 0
# process arguments
min_len = args.min_length
max_len = args.max_length
profiles = max_len + 1 - min_len
lengths = list(range(min_len,max_len+1))
outbase = args.outbase
title = "Fiveprime read offsets by length" if args.title is None else args.title
pp.set_style_from_args(args)
colors = pp.get_colors_from_args(args,profiles)
printer.write("Opening ROI file %s ..." % args.roi_file)
with opener(args.roi_file) as roi_fh:
roi_table = pd.read_table(roi_fh,sep="\t",comment="#",index_col=None,header=0)
roi_fh.close()
printer.write("Opening count files %s ..." % ",".join(args.count_files))
ga = ap.get_genome_array_from_args(args,printer=printer)
# remove default size filters
my_filters = ga._filters.keys()
for f in my_filters:
ga.remove_filter(f)
norm_start, norm_end = _get_norm_region(roi_table,args)
# count
count_dict, norm_count_dict, metagene_profile = do_count(roi_table,
ga,
norm_start,
norm_end,
args.min_counts,
min_len,
max_len,
aggregate=args.aggregate,
printer=printer)
# save counts
profile_fn = "%s_metagene_profiles.txt" % outbase
with argsopener(profile_fn,args,"w") as metagene_out:
metagene_profile.to_csv(metagene_out,
sep="\t",
header=True,
index=False,
na_rep="nan",
columns=["x"]+["%s-mers" % X for X in lengths])
metagene_out.close()
if args.keep == True:
printer.write("Saving raw and normalized counts ...")
for k in count_dict:
count_fn = "%s_%s_rawcounts.txt.gz" % (outbase,k)
normcount_fn = "%s_%s_normcounts.txt.gz" % (outbase,k)
mask_fn = "%s_%s_mask.txt.gz" % (outbase,k)
numpy.savetxt(count_fn,count_dict[k],delimiter="\t")
numpy.savetxt(normcount_fn,norm_count_dict[k],delimiter="\t")
numpy.savetxt(mask_fn,norm_count_dict[k].mask,delimiter="\t")
# plotting & offsets
printer.write("Plotting and determining offsets ...")
offset_dict = OrderedDict()
# Determine scaling factor for plotting metagene profiles
max_y = numpy.nan
with warnings.catch_warnings():
# ignore warnings for slices that contain only NaNs
warnings.simplefilter("ignore",category=RuntimeWarning)
for k in lengths:
max_y = numpy.nanmax([max_y,
numpy.nanmax(metagene_profile["%s-mers"% k].values)])
if numpy.isnan(max_y) or max_y == 0:
max_y = 1.0
# parse arguments & set styles
mplrc = matplotlib.rcParams
plt_incr = 1.2
# use this figsize if not specified on command line
figheight = 1.0 + 0.25*(profiles-1) + 0.75*(profiles)
default_figsize = (7.5,figheight)
fig = pp.get_figure_from_args(args,figsize=default_figsize)
ax = plt.gca()
plt.title(title)
plt.xlabel("Distance from CDS start, (nt; 5' end mapping)")
if args.aggregate == True:
plt.ylabel("Aggregate read counts (au)")
else:
plt.ylabel("Median normalized read density (au)")
plt.axvline(0.0,color=mplrc["axes.edgecolor"],dashes=[3,2])
x = metagene_profile["x"].values
xmin = x.min()
xmax = x.max()
if args.constrain is not None:
mask = numpy.tile(True,len(x))
zp = (x==0).argmax()
l,r = args.constrain
if l == r:
warnings.warn("Minimum and maximum distance constraints are equal (both '%s'). This is silly." % l,ArgumentWarning)
mindist = min(l,r)
maxdist = max(l,r)
mask[zp-maxdist:zp-mindist+1] = False
elif args.require_upstream == True:
mask = x >= 0
else:
mask = numpy.tile(False,len(x))
for n,k in enumerate(lengths):
color = colors[n]
baseline = plt_incr*n
y = metagene_profile["%s-mers" % k].values
#ymask = y[mask]
ymask = numpy.ma.MaskedArray(y,mask=mask)
if numpy.isnan(y).all():
plot_y = numpy.zeros_like(x)
else:
if args.aggregate == False:
plot_y = y / max_y
else:
plot_y = y.astype(float) / numpy.nanmax(y) * 0.9
# plot metagene profiles on common scale, offset by baseline from bottom to top
ax.plot(x,baseline + plot_y,color=color)
ax.text(xmin,baseline,"%s-mers" % k,
ha="left",
va="bottom",
color=color,
transform=matplotlib.transforms.offset_copy(ax.transData,fig,
x=6.0,y=3.0,units="points"))
ymax = baseline + numpy.nanmax(plot_y)
# if all valid positions are nan, or if all valid positions are <= 0
if (~mask).sum() == numpy.isnan(ymask).sum() or numpy.nanmax(ymask) == 0:
offset = args.default
usedefault = True
else:
offset = -x[numpy.ma.argmax(ymask)]
usedefault = False
offset_dict[k] = offset
if usedefault == False:
yadj = ymax - 0.2 * plt_incr
ax.plot([-offset,0],[yadj,yadj],color=color,dashes=[3,2])
ax.text(-offset / 2.0,
yadj,
"%s nt" % (offset),
color=color,
ha="center",
va="bottom",
transform=matplotlib.transforms.offset_copy(ax.transData,fig,
x=0.0,y=3.0,units="points")
)
plt.xlim(xmin,xmax)
plt.ylim(-0.1,plt_incr+baseline)
ax.yaxis.set_ticks([])
# save data as p-site offset table
fn = "%s_p_offsets.txt" % outbase
fout = argsopener(fn,args)
printer.write("Writing offset table to %s ..." % fn)
fout.write("length\tp_offset\n")
for k in offset_dict:
fout.write("%s\t%s\n" % (k,offset_dict[k]))
fout.write("default\t%s" % args.default)
fout.close()
# save plot
plot_fn ="%s_p_offsets.%s" % (outbase,args.figformat)
printer.write("Saving plot to %s ..." % plot_fn)
plt.savefig(plot_fn,dpi=args.dpi,bbox_inches="tight")
printer.write("Done.")
示例#4文件:
get_count_vectors.py项目:
joshuagryphon/plastid
def main(args=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:func:`main` is called directly.
Default: `sys.argv[1:]`. The command-line arguments, if the script is
invoked from the command line
"""
al = AlignmentParser()
an = AnnotationParser()
mp = MaskParser()
bp = BaseParser()
alignment_file_parser = al.get_parser(conflict_handler="resolve")
annotation_file_parser = an.get_parser(conflict_handler="resolve")
mask_file_parser = mp.get_parser()
base_parser = bp.get_parser()
parser = argparse.ArgumentParser(
description=format_module_docstring(__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter,
conflict_handler="resolve",
parents=[base_parser, alignment_file_parser, annotation_file_parser, mask_file_parser],
)
parser.add_argument("out_folder", type=str, help="Folder in which to save output vectors")
parser.add_argument(
"--out_prefix", default="", type=str, help="Prefix to prepend to output files (default: no prefix)"
)
parser.add_argument(
"--format", default="%.8f", type=str, help=r"printf-style format string for output (default: '%%.8f')"
)
args = parser.parse_args(args)
bp.get_base_ops_from_args(args)
# if output folder doesn't exist, create it
if not os.path.isdir(args.out_folder):
os.mkdir(args.out_folder)
# parse args
ga = al.get_genome_array_from_args(args, printer=printer)
transcripts = an.get_segmentchains_from_args(args, printer=printer)
mask_hash = mp.get_genome_hash_from_args(args, printer=printer)
# evaluate
for n, tx in enumerate(transcripts):
if n % 1000 == 0:
printer.write("Processed %s regions of interest" % n)
filename = "%s%s.txt" % (args.out_prefix, tx.get_name())
full_filename = os.path.join(args.out_folder, filename)
# mask out overlapping masked regions
overlapping = mask_hash.get_overlapping_features(tx)
for feature in overlapping:
tx.add_masks(*feature.segments)
count_vec = tx.get_masked_counts(ga)
numpy.savetxt(full_filename, count_vec, fmt=args.format)
示例#5文件:
phase_by_size.py项目:
joshuagryphon/plastid
def main(argv=sys.argv[1:]):
"""Command-line program
Parameters
----------
argv : list, optional
A list of command-line arguments, which will be processed
as if the script were called from the command line if
:py:func:`main` is called directly.
Default: `sys.argv[1:]`. The command-line arguments, if the script is
invoked from the command line
"""
al = AlignmentParser(disabled=["normalize","big_genome","spliced_bowtie_files"],
input_choices=["BAM"])
an = AnnotationParser()
pp = PlottingParser()
bp = BaseParser()
plotting_parser = pp.get_parser()
alignment_file_parser = al.get_parser(conflict_handler="resolve")
annotation_file_parser = an.get_parser(conflict_handler="resolve")
base_parser = bp.get_parser()
parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
formatter_class=argparse.RawDescriptionHelpFormatter,
conflict_handler="resolve",
parents=[base_parser,
annotation_file_parser,
alignment_file_parser,
plotting_parser])
parser.add_argument("roi_file",type=str,nargs="?",default=None,
help="Optional. ROI file of maximal spanning windows surrounding start codons, "+\
"from ``metagene generate`` subprogram. Using this instead of `--annotation_files` "+\
"prevents double-counting of codons when multiple transcript isoforms exist "+\
"for a gene. See the documentation for `metagene` for more info about ROI files."+\
"If an ROI file is not given, supply an annotation with ``--annotation_files``")
parser.add_argument("outbase",type=str,help="Required. Basename for output files")
parser.add_argument("--codon_buffer",type=int,default=5,
help="Codons before and after start codon to ignore (Default: 5)")
args = parser.parse_args(argv)
bp.get_base_ops_from_args(args)
pp.set_style_from_args(args)
gnd = al.get_genome_array_from_args(args,printer=printer)
read_lengths = list(range(args.min_length,args.max_length+1))
codon_buffer = args.codon_buffer
dtmp = { "read_length" : numpy.array(read_lengths),
"reads_counted" : numpy.zeros_like(read_lengths,dtype=int),
}
if args.roi_file is not None:
using_roi = True
roi_table = read_pl_table(args.roi_file)
regions = roi_table.iterrows()
transform_fn = roi_row_to_cds
back_buffer = -1
if len(args.annotation_files) > 0:
warnings.warn("If an ROI file is given, annotation files are ignored. Pulling regions from '%s'. Ignoring '%s'" % (args.roi_file,
", ".join(args.annotation_files)),
ArgumentWarning)
else:
using_roi = False
if len(args.annotation_files) == 0:
printer.write("Either an ROI file or at least annotation file must be given.")
sys.exit(1)
else:
warnings.warn("Using a transcript annotation file instead of an ROI file can lead to double-counting of codons if the annotation contains multiple transcripts per gene.",
ArgumentWarning)
regions = an.get_transcripts_from_args(args,printer=printer)
back_buffer = -codon_buffer
transform_fn = lambda x: x.get_cds()
phase_sums = {}
for k in read_lengths:
phase_sums[k] = numpy.zeros(3)
for n, roi in enumerate(regions):
if n % 1000 == 1:
printer.write("Counted %s ROIs ..." % n)
# transformation needed to extract CDS from transcript or from ROI file window
cds_part = transform_fn(roi)
# only calculate for coding genes
if len(cds_part) > 0:
read_dict = {}
count_vectors = {}
for k in read_lengths:
read_dict[k] = []
count_vectors[k] = []
# for each seg, fetch reads, sort them, and create individual count vectors
for seg in cds_part:
reads = gnd.get_reads(seg)
for read in filter(lambda x: len(x.positions) in read_dict,reads):
read_dict[len(read.positions)].append(read)
# map and sort by length
for read_length in read_dict:
count_vector = list(gnd.map_fn(read_dict[read_length],seg)[1])
count_vectors[read_length].extend(count_vector)
# add each count vector for each length to total
for k, vec in count_vectors.items():
counts = numpy.array(vec)
if cds_part.strand == "-":
counts = counts[::-1]
if len(counts) % 3 == 0:
counts = counts.reshape((len(counts)/3,3))
else:
if using_roi == False:
message = "Length of '%s' coding region (%s nt) is not divisible by 3. Ignoring last partial codon." % (roi.get_name(),len(counts))
warnings.warn(message,DataWarning)
newlen = len(counts)//3
counts = counts[:3*newlen]
counts = counts.reshape(newlen,3)
phase_sums[k] += counts[codon_buffer:back_buffer,:].sum(0)
printer.write("Counted %s ROIs total." % (n+1))
for k in dtmp:
dtmp[k] = numpy.array(dtmp[k])
# total reads counted for each size
for k in read_lengths:
dtmp["reads_counted"][dtmp["read_length"] == k] = phase_sums[k].sum()
# read length distribution
dtmp["fraction_reads_counted"] = dtmp["reads_counted"].astype(float) / dtmp["reads_counted"].sum()
# phase vectors
phase_vectors = { K : V.astype(float)/V.astype(float).sum() for K,V in phase_sums.items() }
for i in range(3):
dtmp["phase%s" % i] = numpy.zeros(len(dtmp["read_length"]))
for k, vec in phase_vectors.items():
for i in range(3):
dtmp["phase%s" % i][dtmp["read_length"] == k] = vec[i]
# phase table
fn = "%s_phasing.txt" % args.outbase
printer.write("Saving phasing table to %s ..." % fn)
dtmp = pd.DataFrame(dtmp)
with argsopener(fn,args) as fh:
dtmp.to_csv(fh,columns=["read_length",
"reads_counted",
"fraction_reads_counted",
"phase0",
"phase1",
"phase2",
],
float_format="%.6f",
na_rep="nan",
sep="\t",
index=False,
header=True
)
fh.close()
fig = {}
if args.figsize is not None:
fig["figsize"] = tuple(args.figsize)
colors = pp.get_colors_from_args(args,len(read_lengths))
fn = "%s_phasing.%s" % (args.outbase,args.figformat)
printer.write("Plotting to %s ..." % fn)
plot_counts = numpy.vstack([V for (_,V) in sorted(phase_sums.items())])
fig, (ax1,_) = phase_plot(plot_counts,labels=read_lengths,lighten_by=0.3,
cmap=None,color=colors,fig=fig)
if args.title is not None:
ax1.set_title(args.title)
else:
ax1.set_title("Phasing stats for %s" % args.outbase)
fig.savefig(fn,dpi=args.dpi,bbox_inches="tight")