Pbase
example dataPackage: Pbase
Authors: Laurent Gatto and
Sebastian Gibb
Last compiled: Fri Jan 4 20:14:25 2019
Last modified: 2018-10-30 15:44:45
This vignette briefly introduces the central data object of the
Pbase
package, namely Proteins
instances, as depicted below. They
contain a set of protein sequences (10 in the figure below), composed
of the protein sequences (grey boxes) and annotation data (table on
the left). Each protein links to a set of ranges of interest, such as
protein domains of experimentally observed peptides (also in grey)
that are also decorated with their own annotation data. The figure
also show the accessors for the different data slots, that are
detailed in ?Proteins
.
Proteins
objects are populated by protein sequences stemming from a
fasta file and the peptides typically originate from an LC-MSMS
experiment.
The original data used below is a 10 fmol
Peptide Retention Time Calibration Mixture
spiked into 50 ng HeLa background acquired on a Thermo Orbitrap Q
Exactive instrument. A restricted set of high scoring human proteins
from the UniProt release 2015_02
were searched using the MSGF+
search engine.
library("Biostrings")
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
fafile <- system.file("extdata/HUMAN_2015_02_selected.fasta",
package = "Pbase")
fa <- readAAStringSet(fafile)
fa
## A AAStringSet instance of length 9
## width seq names
## [1] 2602 MPVTEKDLAEDAPWKKIQQNTF...VLAVKWGEEHIPGSPFHVTVP sp|O75369|FLNB_HU...
## [2] 3374 MSPESGHSRIFEATAGPNKPES...YTLSKDSLSNGVPSGRQAEFS sp|A4UGR9|XIRP2_H...
## [3] 2624 MFRRARLSVKPNVRPGVGARGS...EATTVSEYFFNDIFIEVDETE sp|A6H8Y1|BDP1_HU...
## [4] 911 MVDYHAANQSYQYGPSSAGNGA...AVPGALDYKSFSTALYGESDL sp|O43707|ACTN4_H...
## [5] 417 MSLSNKLTLDKLDVKGKRVVMR...GASLELLEGKVLPGVDALSNI sp|P00558|PGK1_HU...
## [6] 375 MDDDIAALVVDNGSGMCKAGFA...MWISKQEYDESGPSIVHRKCF sp|P60709|ACTB_HU...
## [7] 664 METPSQRRATRSGAQASSTPLS...RSYLLGNSSPRTQSPQNCSIM sp|P02545|LMNA_HU...
## [8] 364 MPYQYPALTPEQKKELSDIAHR...TPSGQAGAAASESLFVSNHAY sp|P04075|ALDOA_H...
## [9] 418 MARRKPEGSSFNMTHLSMAMAF...TPSGQAGAAASESLFVSNHAY sp|P04075-2|ALDOA...
library("mzID")
idfile <- system.file("extdata/Thermo_Hela_PRTC_selected.mzid",
package = "Pbase")
id <- flatten(mzID(idfile))
## reading Thermo_Hela_PRTC_selected.mzid... DONE!
dim(id)
## [1] 137 29
head(id)
## spectrumid scan number(s)
## 1 index=173 12256
## 1.1 index=173 12256
## 2 index=163 11860
## 2.1 index=163 11860
## 3 index=200 13408
## 3.1 index=200 13408
## spectrum title
## 1 msLevel 2; retentionTime 2094.56706; scanNum 12256; precMz 1137.06665029649; precCharge 2
## 1.1 msLevel 2; retentionTime 2094.56706; scanNum 12256; precMz 1137.06665029649; precCharge 2
## 2 msLevel 2; retentionTime 2039.84424; scanNum 11860; precMz 1136.57450195803; precCharge 2
## 2.1 msLevel 2; retentionTime 2039.84424; scanNum 11860; precMz 1136.57450195803; precCharge 2
## 3 msLevel 2; retentionTime 2258.27868; scanNum 13408; precMz 703.038108542133; precCharge 3
## 3.1 msLevel 2; retentionTime 2258.27868; scanNum 13408; precMz 703.038108542133; precCharge 3
## acquisitionnum passthreshold rank calculatedmasstocharge
## 1 173 TRUE 1 1136.574
## 1.1 173 TRUE 1 1136.574
## 2 163 TRUE 1 1136.574
## 2.1 163 TRUE 1 1136.574
## 3 200 TRUE 1 703.037
## 3.1 200 TRUE 1 703.037
## experimentalmasstocharge chargestate ms-gf:denovoscore ms-gf:evalue
## 1 1137.0667 2 132 2.597097e-18
## 1.1 1137.0667 2 132 2.597097e-18
## 2 1136.5745 2 230 4.942664e-17
## 2.1 1136.5745 2 230 4.942664e-17
## 3 703.0381 3 145 4.080429e-10
## 3.1 703.0381 3 145 4.080429e-10
## ms-gf:rawscore ms-gf:specevalue assumeddissociationmethod isotopeerror
## 1 118 2.276758e-22 CID 1
## 1.1 118 2.276758e-22 CID 1
## 2 186 4.333009e-21 CID 0
## 2.1 186 4.333009e-21 CID 0
## 3 98 3.578068e-14 CID 0
## 3.1 98 3.578068e-14 CID 0
## isdecoy post pre end start accession length
## 1 FALSE C K 134 112 sp|P04075|ALDOA_HUMAN 364
## 1.1 FALSE C K 188 166 sp|P04075-2|ALDOA_HUMAN 418
## 2 FALSE C K 134 112 sp|P04075|ALDOA_HUMAN 364
## 2.1 FALSE C K 188 166 sp|P04075-2|ALDOA_HUMAN 418
## 3 FALSE Y K 173 154 sp|P04075|ALDOA_HUMAN 364
## 3.1 FALSE Y K 227 208 sp|P04075-2|ALDOA_HUMAN 418
## description
## 1 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2
## 1.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA
## 2 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2
## 2.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA
## 3 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2
## 3.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA
## pepseq modified modification
## 1 GVVPLAGTNGETTTQGLDGLSER FALSE <NA>
## 1.1 GVVPLAGTNGETTTQGLDGLSER FALSE <NA>
## 2 GVVPLAGTNGETTTQGLDGLSER FALSE <NA>
## 2.1 GVVPLAGTNGETTTQGLDGLSER FALSE <NA>
## 3 IGEHTPSALAIMENANVLAR FALSE <NA>
## 3.1 IGEHTPSALAIMENANVLAR FALSE <NA>
## idFile spectrumFile
## 1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf
## 1.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf
## 2 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf
## 2.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf
## 3 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf
## 3.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf
## databaseFile
## 1 HUMAN_2015_02_selected.fasta
## 1.1 HUMAN_2015_02_selected.fasta
## 2 HUMAN_2015_02_selected.fasta
## 2.1 HUMAN_2015_02_selected.fasta
## 3 HUMAN_2015_02_selected.fasta
## 3.1 HUMAN_2015_02_selected.fasta
library("Pbase")
p <- Proteins(fafile)
p <- addIdentificationData(p, idfile)
## Reading 1 identification files:
## 1. /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/Thermo_Hela_PRTC_selected.mzid
## done.
p
## S4 class type : Proteins
## Class version : 0.2
## Created : Fri Jan 4 20:14:42 2019
## Number of Proteins: 9
## Sequences:
## [1] A4UGR9 [2] A6H8Y1 ... [8] P04075-2 [9] P60709
## Protein ranges:
## Peptides
A Proteins
object is composed of a set of protein sequences
accessible with the aa
accessor as well as an optional set of
peptides features that are mapped as coordinates along the proteins,
available with pranges
. The actual peptide sequences can be extraced
with pfeatures
. The names of the protein sequences can be extraced
with seqnames
.
aa(p)
## A AAStringSet instance of length 9
## width seq names
## [1] 3374 MSPESGHSRIFEATAGPNKPES...YTLSKDSLSNGVPSGRQAEFS A4UGR9
## [2] 2624 MFRRARLSVKPNVRPGVGARGS...EATTVSEYFFNDIFIEVDETE A6H8Y1
## [3] 911 MVDYHAANQSYQYGPSSAGNGA...AVPGALDYKSFSTALYGESDL O43707
## [4] 2602 MPVTEKDLAEDAPWKKIQQNTF...VLAVKWGEEHIPGSPFHVTVP O75369
## [5] 417 MSLSNKLTLDKLDVKGKRVVMR...GASLELLEGKVLPGVDALSNI P00558
## [6] 664 METPSQRRATRSGAQASSTPLS...RSYLLGNSSPRTQSPQNCSIM P02545
## [7] 364 MPYQYPALTPEQKKELSDIAHR...TPSGQAGAAASESLFVSNHAY P04075
## [8] 418 MARRKPEGSSFNMTHLSMAMAF...TPSGQAGAAASESLFVSNHAY P04075-2
## [9] 375 MDDDIAALVVDNGSGMCKAGFA...MWISKQEYDESGPSIVHRKCF P60709
seqnames(p)
## [1] "A4UGR9" "A6H8Y1" "O43707" "O75369" "P00558" "P02545"
## [7] "P04075" "P04075-2" "P60709"
pranges(p)
## DataFrame with 9 rows and 1 column
## Peptides
## <IRangesList>
## A4UGR9 2743-2760,307-318,1858-1870,...
## A6H8Y1 448-465,1284-1291,1120-1128,...
## O43707 51-65,495-512,438-450,...
## O75369 895-909,1746-1757,2563-2578,...
## P00558 193-206,268-275,333-350,...
## P02545 1-11,332-349,367-377,...
## P04075 112-134,112-134,154-173,...
## P04075-2 166-188,166-188,208-227,...
## P60709 184-196
pfeatures(p)
## AAStringSetList of length 9
## [["A4UGR9"]] A4UGR9=QEITQNKSFFSSVKESQR ... A4UGR9=QEITQNKSFFSSVK
## [["A6H8Y1"]] A6H8Y1=EDAEQVALEVDLNQKKRR ...
## [["O43707"]] O43707=QQRKTFTAWCNSHLR ... O43707=VGWEQLLTTIAR
## [["O75369"]] O75369=DLDIIDNYDYSHTVK ... O75369=VQAQGPGLKEAFTNK
## [["P00558"]] P00558=ELNYFAKALESPER P00558=DLMSKAEK ... P00558=GTKALMDEVVK
## [["P02545"]] P02545=METPSQRRATR ... P02545=RATRSGAQASSTPLSPTR
## [["P04075"]] P04075=GVVPLAGTNGETTTQGLDGLSER ...
## [["P04075-2"]] P04075-2=GVVPLAGTNGETTTQGLDGLSER ...
## [["P60709"]] P60709=DLTDYLMKILTER
A Proteins instance is further described by general metadata
list. Protein sequence and peptide features annotations can be
accessed with acols
and pcols
respectively, which return
DataFrame
instances.
metadata(p)
## $created
## [1] "Fri Jan 4 20:14:42 2019"
acols(p)
## DataFrame with 9 rows and 12 columns
## DB AccessionNumber EntryName IsoformName
## <Rle> <character> <character> <Rle>
## A4UGR9 sp A4UGR9 XIRP2_HUMAN NA
## A6H8Y1 sp A6H8Y1 BDP1_HUMAN NA
## O43707 sp O43707 ACTN4_HUMAN NA
## O75369 sp O75369 FLNB_HUMAN NA
## P00558 sp P00558 PGK1_HUMAN NA
## P02545 sp P02545 LMNA_HUMAN NA
## P04075 sp P04075 ALDOA_HUMAN NA
## P04075-2 sp P04075-2 ALDOA_HUMAN 2
## P60709 sp P60709 ACTB_HUMAN NA
## ProteinName OrganismName
## <character> <Rle>
## A4UGR9 Xin actin-binding repeat-containing protein 2 Homo sapiens
## A6H8Y1 Transcription factor TFIIIB component B'' homolog Homo sapiens
## O43707 Alpha-actinin-4 Homo sapiens
## O75369 Filamin-B Homo sapiens
## P00558 Phosphoglycerate kinase 1 Homo sapiens
## P02545 Prelamin-A/C Homo sapiens
## P04075 Fructose-bisphosphate aldolase A Homo sapiens
## P04075-2 Fructose-bisphosphate aldolase A Homo sapiens
## P60709 Actin, cytoplasmic 1 Homo sapiens
## GeneName ProteinExistence SequenceVersion Comment
## <Rle> <Rle> <Rle> <Rle>
## A4UGR9 XIRP2 Evidence at protein level 2 NA
## A6H8Y1 BDP1 Evidence at protein level 3 NA
## O43707 ACTN4 Evidence at protein level 2 NA
## O75369 FLNB Evidence at protein level 2 NA
## P00558 PGK1 Evidence at protein level 3 NA
## P02545 LMNA Evidence at protein level 1 NA
## P04075 ALDOA Evidence at protein level 2 NA
## P04075-2 ALDOA NA NA NA
## P60709 ACTB Evidence at protein level 1 NA
## Filename
## <Rle>
## A4UGR9 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## A6H8Y1 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## O43707 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## O75369 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## P00558 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## P02545 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## P04075 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## P04075-2 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## P60709 /tmp/RtmpSKeaxK/Rinst77cc785ede1d/Pbase/extdata/HUMAN_2015_02_selected.fasta
## npeps
## <integer>
## A4UGR9 36
## A6H8Y1 23
## O43707 6
## O75369 13
## P00558 5
## P02545 12
## P04075 21
## P04075-2 20
## P60709 1
pcols(p)
## DataFrame with 9 rows and 1 column
## Peptides
## <IRangesList>
## A4UGR9 2743-2760,307-318,1858-1870,...
## A6H8Y1 448-465,1284-1291,1120-1128,...
## O43707 51-65,495-512,438-450,...
## O75369 895-909,1746-1757,2563-2578,...
## P00558 193-206,268-275,333-350,...
## P02545 1-11,332-349,367-377,...
## P04075 112-134,112-134,154-173,...
## P04075-2 166-188,166-188,208-227,...
## P60709 184-196
Specific proteins can be extracted by index of name using
[
and proteins and their peptide features can be plotted
with the default plot method.
seqnames(p)
## [1] "A4UGR9" "A6H8Y1" "O43707" "O75369" "P00558" "P02545"
## [7] "P04075" "P04075-2" "P60709"
plot(p[c(1,9)])
More details can be found in ?Proteins
. The object generated above
is also directly available as data(p)
.
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.5 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.8-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.8-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] mzID_1.20.1 Biostrings_2.50.2 XVector_0.22.0
## [4] Pbase_0.22.1 Gviz_1.26.4 GenomicRanges_1.34.0
## [7] GenomeInfoDb_1.18.1 IRanges_2.16.0 S4Vectors_0.20.1
## [10] Rcpp_1.0.0 BiocGenerics_0.28.0 BiocStyle_2.10.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 biovizBase_1.30.1
## [3] htmlTable_1.13 base64enc_0.1-3
## [5] dichromat_2.0-0 rstudioapi_0.8
## [7] mzR_2.16.1 affyio_1.52.0
## [9] bit64_0.9-7 AnnotationDbi_1.44.0
## [11] codetools_0.2-16 splines_3.5.2
## [13] ncdf4_1.16 doParallel_1.0.14
## [15] impute_1.56.0 knitr_1.21
## [17] Formula_1.2-3 Rsamtools_1.34.0
## [19] cluster_2.0.7-1 vsn_3.50.0
## [21] BiocManager_1.30.4 compiler_3.5.2
## [23] httr_1.4.0 backports_1.1.3
## [25] assertthat_0.2.0 Matrix_1.2-15
## [27] lazyeval_0.2.1 limma_3.38.3
## [29] acepack_1.4.1 htmltools_0.3.6
## [31] prettyunits_1.0.2 tools_3.5.2
## [33] bindrcpp_0.2.2 gtable_0.2.0
## [35] glue_1.3.0 GenomeInfoDbData_1.2.0
## [37] affy_1.60.0 dplyr_0.7.8
## [39] MALDIquant_1.18 Biobase_2.42.0
## [41] preprocessCore_1.44.0 rtracklayer_1.42.1
## [43] iterators_1.0.10 xfun_0.4
## [45] stringr_1.3.1 ensembldb_2.6.3
## [47] XML_3.98-1.16 zlibbioc_1.28.0
## [49] MASS_7.3-51.1 scales_1.0.0
## [51] BSgenome_1.50.0 MSnbase_2.8.3
## [53] VariantAnnotation_1.28.5 pcaMethods_1.74.0
## [55] hms_0.4.2 ProtGenerics_1.14.0
## [57] SummarizedExperiment_1.12.0 AnnotationFilter_1.6.0
## [59] RColorBrewer_1.1-2 yaml_2.2.0
## [61] curl_3.2 memoise_1.1.0
## [63] gridExtra_2.3 ggplot2_3.1.0
## [65] cleaver_1.20.0 biomaRt_2.38.0
## [67] rpart_4.1-13 latticeExtra_0.6-28
## [69] stringi_1.2.4 RSQLite_2.1.1
## [71] foreach_1.4.4 checkmate_1.8.5
## [73] GenomicFeatures_1.34.1 BiocParallel_1.16.5
## [75] rlang_0.3.0.1 pkgconfig_2.0.2
## [77] matrixStats_0.54.0 bitops_1.0-6
## [79] evaluate_0.12 lattice_0.20-38
## [81] purrr_0.2.5 bindr_0.1.1
## [83] GenomicAlignments_1.18.1 htmlwidgets_1.3
## [85] bit_1.1-14 tidyselect_0.2.5
## [87] plyr_1.8.4 magrittr_1.5
## [89] bookdown_0.9 Pviz_1.16.0
## [91] R6_2.3.0 Hmisc_4.1-1
## [93] DelayedArray_0.8.0 DBI_1.0.0
## [95] pillar_1.3.1 foreign_0.8-71
## [97] survival_2.43-3 RCurl_1.95-4.11
## [99] nnet_7.3-12 tibble_2.0.0
## [101] crayon_1.3.4 rmarkdown_1.11
## [103] progress_1.2.0 data.table_1.11.8
## [105] blob_1.1.1 digest_0.6.18
## [107] munsell_0.5.0