#!/usr/bin/env python # # Copyright (c) 2016 10X Genomics, Inc. All rights reserved. # """ Reference preparation tool for 10x Genomics {product}. Build a {product}-compatible reference folder from user-supplied genome FASTA and gene GTF files. Creates a new folder named after the genome. The commands below should be preceded by '{cmd}': Usage: mkref --genome=NAME ... --fasta=PATH ... --genes=PATH ... [options] mkref -h | --help | --version Arguments: genome Unique genome name(s), used to name output folder [a-zA-Z0-9_-]+. Specify multiple genomes by specifying the --genome argument multiple times; the output folder will be _and_. fasta Path(s) to FASTA file containing your genome reference. Specify multiple genomes by specifying the --fasta argument multiple times. genes Path(s) to genes GTF file(S) containing annotated genes for your genome reference. Specify multiple genomes by specifying the --genes argument multiple times. Options: --nthreads= Number of threads used during STAR genome index generation. Defaults to 1. --memgb= Maximum memory (GB) used when aligning reads with STAR. Defaults to 16. --ref-version= Optional reference version string to include with reference. -h --help Show this message. --version Show version. """ from __future__ import absolute_import, print_function import os import sys import docopt import cellranger.reference_builder as cr_ref import cellranger.io as cr_io from cellranger.products import get_cmd_names def _parse_args(product_name): version = "%s %s %s\n%s" % ( product_name, os.getenv("TENX_SUBCMD", ""), os.getenv("TENX_VERSION", ""), os.getenv("TENX_COPYRIGHT", ""), ) product, cmd = get_cmd_names(product_name) return docopt.docopt(__doc__.format(product=product, cmd=cmd), version=version) def main(): print(sys.argv) sys.stdout.flush() args = _parse_args(os.getenv("TENX_PRODUCT", "")) genomes = args["--genome"] input_fasta_files = cr_io.get_input_paths(args["--fasta"]) input_genes_files = cr_io.get_input_paths(args["--genes"]) num_threads = args["--nthreads"] mem_gb = args["--memgb"] output_base_dir = "_and_".join(genomes) output_dir = os.path.join(os.getcwd(), output_base_dir) ref_version = args["--ref-version"] mkref_version = "{}-{}".format(os.getenv("TENX_PRODUCT", ""), os.getenv("TENX_VERSION", "")) if num_threads is None: num_threads = 1 elif num_threads.isdigit() and int(num_threads) > 0: num_threads = int(num_threads) else: sys.exit("--nthreads must be a positive integer") if mem_gb is None: mem_gb = 16 elif mem_gb.isdigit() and int(mem_gb) > 0: mem_gb = int(mem_gb) else: sys.exit("--memgb must be a positive integer") if len(genomes) != len(input_fasta_files) or len(genomes) != len(input_genes_files): sys.exit("Please provide the same number of genome names, genome FASTA and gene GTF files") for genome in genomes: if "/" in genome: sys.exit( "The specified genome name '%s' contains a '/' character, which is not allowed. --genome must specify a name for the reference, not a path." % genome ) # Check that destination folder doesn't already exist if os.path.exists(output_dir): sys.exit( "Destination reference folder already exists: %s\nPlease delete and start again." % output_dir ) # Check for write permissions in output directory output_parent_dir = os.path.dirname(output_dir) if not os.access(output_parent_dir, os.W_OK): sys.exit("You do not have write permission in %s." % output_parent_dir) try: reference_builder = cr_ref.ReferenceBuilder( genomes, input_fasta_files, input_genes_files, output_dir, ref_version, mkref_version, num_threads=num_threads, mem_gb=mem_gb, ) reference_builder.build_gex_reference() except (cr_ref.GtfParseError, cr_ref.GexReferenceError) as ex: sys.exit("mkref has failed: error building reference package\n{}".format(ex)) print(">>> Reference successfully created! <<<\n") print("You can now specify this reference on the command line:") print("cellranger --transcriptome=%s ..." % output_dir) if __name__ == "__main__": main()