Data source
Define a composed data source from multiple files in several folders.
# Glob pattern that selects the files to annotate.
pattern = "*.maf.xz"
# 'True' if these annotations apply recursively to all subdirectories;
# 'False' if they apply only to the current folder.
recursive = False
# Without annotations, bgparsers will only try to find these columns in
# all the data sources:
# - SAMPLE
# - DONOR
# - CHROMOSOME
# - POSITION
# - REF
# - ALT
# - STRAND
# - ALT_TYPE
#
# To parse more columns, add an 'annotations' section that defines them.
# Note that you can also override these default columns there if you want.
#
[annotations]
# This is a "static" annotation: it is added to every row of every file
# that matches this 'bginfo' pattern.
#
# Syntax: <annotation> = <string>
PLATFORM=WXS
# This is a "view" annotation: it creates a new annotation from other row
# annotations and values. You can use Python format syntax:
# https://docs.python.org/3/library/string.html#formatspec
#
# Syntax: <annotation> = <string with python format syntax>
COORDINATE="chr{CHROMOSOME}:{POSITION}:{STRAND}"
# These are annotations (other than the standard core annotations that bgparsers
# can detect automatically) that are inside the data files and that we want to
# extract from them. If it is a core annotation, this definition prevails over
# any automatic detection.
#
# Syntax: <annotation> = "('internal', '<file column name>')"
DONOR = "('internal', 'SAMPLE')"
# It is also possible to combine multiple columns into a new one using Python
# string format syntax, with the columns referenced by keyword.
# https://docs.python.org/3/library/string.html#formatstrings
#
# Syntax: <annotation> = "('internal', ('<python format like string>', <list of file column names to use>))"
SAMPLE = "('internal', ('{icgc_sample_id}_{icgc_specimen_id}', ['icgc_sample_id', 'icgc_specimen_id']))"
# A mapping annotation uses a mapping file to add a new column based on
# a many-to-one relation.
#
# The mapping file is a tab-separated dataset with headers and at least two columns.
#
# Syntax: <annotation> = "('mapping', '<dataset key column>', '<mapping file>', '<mapping key column>', '<mapping value column>')"
#
# NOTE(review): DONOR is also defined above as an 'internal' annotation. The two
# definitions read as alternative examples; a real file should presumably keep
# only one of them, since duplicate keys are typically rejected or silently
# overridden by the parser -- confirm against the bgparsers loader.
DONOR = "('mapping', 'SAMPLE', 'mapping_samples_selected.txt', 'SAMPLE_ID', 'PATIENT_ID')"
# An annotation based on the name of the folder that contains the file.
#
# The doubled braces in '{{PLATFORM}}' are escaped for str.format, so the lambda
# returns a string that still contains the literal '{PLATFORM}' placeholder
# (presumably resolved afterwards like a "view" annotation -- TODO confirm).
#
# Syntax: <annotation> = "('dirname', '<regular expression>', <optional postprocessing python function>)"
DATASET = "('dirname', '(.*)', lambda v: '{{PLATFORM}}_{}'.format(v.upper()))"
# An annotation based on the file name.
#
# The whole value is wrapped in double quotes, so string literals inside it
# must use single quotes; an inner double quote would end the value early.
#
# Syntax: <annotation> = "('filename', '<regular expression>', <optional postprocessing python function>)"
GENOMEREF = "('filename', '([^.]+)\\.maf\\.xz', lambda v: 'hg{}'.format(v.upper()))"
# Run a LiftOver of the genomic coordinates.
#
# Syntax: <annotation> = "('liftover', <from version>, <to version>)"
POSITION_HG38 = "('liftover', '{GENOMEREF}', 'HG38')"
# Optionally, a fourth argument formats the output of the LiftOver (using Python format syntax).
# (If no format is declared, only the position is returned.)
POSITION_HG19 = "('liftover', '{GENOMEREF}', 'HG19', 'chr{CHROMOSOME}:{POSITION}:{STRAND}')"
# The 'excludes' section filters out all the rows that have specific values in an annotation.
#
[excludes]
# Syntax: <annotation> = "<value to filter out>"
DONOR = "EXCLUDED_DONOR"