conf/test-data.config

 params {
    /*
    Dataset definitions - these are assemblies for which we want to generate pretzel-compatible JSON files. 
    
    Fields:
      species (required)   - no spaces, stick to alphanumeric characters and underscores
      version (required)   - version of the assembly 
      shortName (optional) - name displayed in pretzel over the chromosome axis

    The combination of species+version must be unique among input assemblies.

    In addition, to generate different data set types you will need one or more of the following:
  
      1. For generation of protein-alignment-based aliases, which are most useful for interspecies comparisons
          idx - a path or URL to a genome index (fai) file, 
                or, really just sequence identifiers and their lengths 
                if idx not defined then 'fasta' field must be defined (see below)
          pep - a path or URL to a set of proteins FASTA
                if protein definition lines are formatted as pep files from Ensembl genomes
                this is enough, otherwise you will also need 
          gff3 - a gff3 file describing the gene predictions, compatible with content of pep and the underlying genome assembly 
  
      2. To be able to place a set of sequences on an assembly, its definition should specify
          fasta - a path or URL to a genome fasta file
          you will also need to define sequences you want to place, see below comments for 'sequencesToPlace'


    It is recommended to include these (optional) additional fields to capture the origin of your data sets
      source   
      citation 
    
    Furthermore, if chromosome ids in your dataset do not match /^(ch|[0-9]|x|y|i|v)/  
    you may specify the following optional field 
      allowedIdPattern - which could be a regular expression matching your chromosome/supercontig naming pattern 

    */
    // Assemblies used as test data. Each entry is a map describing one assembly;
    // the species+version pair identifies it.
    references = [
      [ // pep, idx and fasta all provided
        species : "Encephalitozoon_intestinalis_ATCC_50506",
        version : "gca_000146465",
        shortName : "E. intestinalis", // arbitrary display name
        pep : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.pep.all.fa.gz",
        // gff3 is not required here because an Ensembl-style pep file is provided. For reference, the matching annotation would be:
        // gff3 : "ftp://ftp.ensemblgenomes.org/pub/fungi/release-45/gff3/fungi_microsporidia1_collection/encephalitozoon_intestinalis_atcc_50506_gca_000146465/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.45.gff3.gz",
        idx :   "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz.fai",
        fasta : "testdata/Encephalitozoon_intestinalis_atcc_50506_gca_000146465.ASM14646v1.dna.toplevel.fa.gz",
        source: "https://fungi.ensembl.org/Encephalitozoon_intestinalis_atcc_50506_gca_000146465"
      ],
      [ // pep, gff3 and idx provided; no genome fasta is given for this assembly
        species : "Encephalitozoon_cuniculi_ecuniii_l",
        version : "gca_001078035",
        shortName : "E. cuniculi L", // arbitrary display name
        pep : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.pep.all.fa.gz",
        // gff3 is not needed with an Ensembl-style pep file, but it is included here for testing purposes
        gff3 : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.45.gff3.gz",
        idx : "testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.dna.toplevel.fa.gz.fai",
        source: "https://fungi.ensembl.org/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035"
      ],
      [ // pep and gff3 provided together; no idx, so the genome fasta is given instead
        species : "Encephalitozoon_hellem_ATCC_50504",
        version : "2014-10-01",
        shortName : "E. hellem", // arbitrary display name
        pep : "testdata/MicrosporidiaDB-46_EhellemATCC50504_AnnotatedProteins.fasta.gz",
        // NOTE(review): pep comes from MicrosporidiaDB, so its headers are presumably not Ensembl-style and gff3 is needed - confirm
        gff3 : "testdata/MicrosporidiaDB-46_EhellemATCC50504.gff.gz",
        fasta : "testdata/MicrosporidiaDB-46_EhellemATCC50504_Genome.fasta.gz",
        allowedIdPattern : '^CP0027.*', // specified because chromosome ids in this data set do not match /^(ch|[0-9]|x|y|i|v)/
        source: "https://microsporidiadb.org/micro/app/record/organism/NCBITAXON_907965"
      ],
      [ // Data set without pep specified, so it will not be used for alias generation; genome FASTA is provided, so marker/other sequences can be placed on it
        species : "Encephalitozoon_cuniculi_EC2",
        version : "GCA_000221265.2",
        shortName : "E cuniculi EC2", // arbitrary display name
        fasta : "testdata/GCA_000221265.2_Ence_cuni_EC2_V1_genomic.fna.gz",
        allowedIdPattern : '^AEWQ010000.*', 
        source: "https://www.ncbi.nlm.nih.gov/assembly/GCA_000221265.2"
      ]
    ]

    /*
    Markers, contigs, scaffolds, gene predictions to be placed on all or a subset of assemblies
    Fields:
      name
      fasta
      seqtype - can be one of markers|transcripts|cds|genomic
      target: [ [species: '', version: ''], [species: '', version: ''] ] //all data sets if this optional field not specified
   */
    // Sequence sets to be placed on assemblies; each entry is a map describing one set.
    sequencesToPlace = [
      [
        name: 'E_cuniculi', 
        fasta: 'testdata/Encephalitozoon_cuniculi_ecuniii_l_gca_001078035.ECIIIL.cds.all.fa.gz',  // local path or remote URL, gzipped or not
        seqtype: 'cds' // one of markers|transcripts|cds|genomic
        // the optional 'target' field is omitted from this entry
      ]
    ]
 }