Výpočetní metody v bioinformatice a systémové biologii

Týden 3 - UCSC a Ensembl Genome Browser - programovatelne dotazy a zber dat

  Jednoduchý způsob interakce se službou Biomart bez potřeby R je přes http protokol:

curl -d@query.xml http://www.biomart.org/biomart/martservice/result

  kde query.xml obsahuje dotaz (lze vygenerovat i na webu biomartu), např.:

query=xml version="1.0" encoding="UTF-8"?>
DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >        
   
<Dataset name = "hsapiens_gene_ensembl" interface = "default" >
        <Filter name = "chromosome_name" value = "1"/>
        <Filter name = "with_illumina_humanwg_6_v3" excluded = "0"/>
        <Attribute name = "ensembl_gene_id" />
        <Attribute name = "ensembl_transcript_id" />
    </Dataset>
Query>

Hledani atributu (nebo filtru):

listAttributes(ensembl)[grep("go",listAttributes(ensembl)$name),]

Vzorové řešení úkolu 1:

library(biomaRt)
mymart = useMart("ensembl",dataset="hsapiens_gene_ensembl")

# vytvor tabulku s relevantnimi daty
g <- getBM(attributes=c('ensembl_gene_id', 'ensembl_transcript_id', 'chromosome_name', 'strand', 'exon_chrom_start', 'exon_chrom_end'), filters = 'with_exon_transcript', values = TRUE, mart = mymart)
tt <- table(factor(g[,2]))
mg <- merge(g,data.frame(tt), by.x="ensembl_transcript_id",by.y="Var1")

# seznam transkriptu s 2 exony (druhy zpusob neni spravny?)
t2 <- names(tt[(which(tt == 2))])
nt2 <- length(t2)
nt2
[1] 32496
mg3 <- mg[which(mg$Freq == 3),]
length(mg3[,1])/3
[1] 28550

# zacatky a konce 1 intronu
i1 <- (1:28550)*3-2
i2 <- (1:28550)*3-1
i3 <- (1:28550)*3
i1beg <- mg3[i1,6] - mg3[i1,5]
i1end <- pmin(abs(mg3[i2,5] - mg3[i1,5]),abs(mg3[i2,6] - mg3[i1,5]))
i2beg <- abs(mg3[i2,6] - mg3[i1,5])
i2end <- pmin(abs(mg3[i3,5] - mg3[i1,5]),abs(mg3[i3,6] - mg3[i1,5]))
tlen <- pmax(abs(mg3[i3,5] - mg3[i1,5]),abs(mg3[i3,6] - mg3[i1,5]))
ri1beg <- i1beg/tlen
ri1end <- i1end/tlen
ri2beg <- i2beg/tlen
ri2end <- i2end/tlen

# vykresli grafy
hist(i1end-i1beg, breaks=50)
hist(i2end-i2beg, breaks=80)
plot(c(0,summary(ri1beg)[4],summary(ri1end)[4],summary(ri2beg)[4],summary(ri2end)[4],1))