Data source

Define a composed data source from multiple files in several folders.

# Glob pattern selecting the files that the annotations in this file apply to
pattern = "*.maf.xz"

# 'True' if these annotations apply recursively to all subdirectories,
# 'False' if they apply only to the current folder
recursive = False

# Without annotations, bgparsers will try to find only these columns in
# all the data sources:
#  - SAMPLE
#  - DONOR
#  - CHROMOSOME
#  - POSITION
#  - REF
#  - ALT
#  - STRAND
#  - ALT_TYPE
#
# If you want to parse more columns you have to add an 'annotations' section
# defining them. Notice that you can also override these columns if you want.
#
[annotations]

# This is a "static" annotation, which means that it will be added to all the
# rows of all the files that match this 'bginfo' pattern.
#
# Syntax: <annotation> = <string>
PLATFORM=WXS

# This is a "view" annotation that creates a new annotation using other row
# annotations and values. You can use Python format string syntax:
# https://docs.python.org/3/library/string.html#formatstrings
#
# Syntax: <annotation> = <string with python format syntax>
COORDINATE="chr{CHROMOSOME}:{POSITION}:{STRAND}"

# These are annotations (other than the standard core annotations that bgparsers
# can automatically detect) that are inside the data files and that we want to
# extract from them. If it is a core annotation, this definition will prevail
# over any automatic detection.
#
# Syntax: <annotation> = "('internal', '<file column name>')"
DONOR = "('internal', 'SAMPLE')"

# It is also possible to use multiple columns to create a new one, using Python
# format string syntax with fields referenced by keyword.
# https://docs.python.org/3/library/string.html#formatstrings
#
# Syntax: <annotation> = "('internal', ('<python format like string>', <list of file column names to use>))"
SAMPLE = "('internal', ('{icgc_sample_id}_{icgc_specimen_id}', ['icgc_sample_id', 'icgc_specimen_id']))"

# A mapping annotation uses a mapping file to add a new column based on
# a many-to-one relation.
#
# The mapping file is a tab-separated dataset with headers and at least two columns.
#
# NOTE(review): DONOR is already defined above as an 'internal' annotation. The
# two definitions read as alternative examples; a real file should presumably
# keep only one of them, since duplicate keys are usually rejected by config
# parsers -- confirm against the bgparsers loader.
#
# Syntax: <annotation> = "('mapping', '<dataset key column>', '<mapping file>', '<mapping key column>', '<mapping value column>')"
DONOR = "('mapping', 'SAMPLE', 'mapping_samples_selected.txt', 'SAMPLE_ID', 'PATIENT_ID')"

# An annotation based on the name of the folder that contains the file.
#
# Syntax: <annotation> = "('dirname', '<regular expression>', <optional postprocessing python function>)"
DATASET = "('dirname', '(.*)', lambda v: '{{PLATFORM}}_{}'.format(v.upper()))"

# An annotation based on the file name.
#
# The string inside the lambda uses single quotes so that it does not terminate
# the surrounding double-quoted value.
#
# Syntax: <annotation> = "('filename', '<regular expression>', <optional postprocessing python function>)"
GENOMEREF = "('filename', '([^.]+)\\.maf\\.xz', lambda v: 'hg{}'.format(v.upper()))"

# Run a LiftOver of the genomic coordinates
#
# Syntax: <annotation> = "('liftover', <from version>, <to version>)"
POSITION_HG38 = "('liftover', '{GENOMEREF}', 'HG38')"

# Optionally you can add a fourth argument to format the output of the LiftOver (using Python format syntax).
# (If no format is declared, only the position is returned.)
POSITION_HG19 = "('liftover', '{GENOMEREF}', 'HG19', 'chr{CHROMOSOME}:{POSITION}:{STRAND}')"

# The excludes section filters out all the rows that have a specific value
# in an annotation.
#
[excludes]

# Syntax: <annotation> = "<value to filter out>"
DONOR = "EXCLUDED_DONOR"