!!!Caution work in progress!!!
Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.
specL contains a function specL::cdsw
.
## Loading required package: DBI
## Loading required package: RSQLite
## Loading required package: seqinr
## Loading required package: ade4
## Loading required package: protViz
quantile
# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
newfrom <- NULL
for(from in windfrom){
idx <- which.min(abs(from - empty))
startmass <- 0
if(isfrom){
if(empty[idx] < from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx-1]
}
}else{
if(empty[idx] > from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx+1]
}
}
newfrom <- c(newfrom, round(startmass,digits=1))
}
return(newfrom)
}
.cdsw_compute_breaks <-
function(xx, nbins){
q <- quantile(xx, seq(0, 1, length = nbins + 1))
q[1] <- q[1] - 0.5
q[length(q)] <- q[length(q)] + 0.5
q <- round(q)
}
cdsw <-
function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
if (class(x) == "psmSet") {
x <- unlist(lapply(x, function(x) {
x$pepmass
}))
} else if (class(x) == 'specLSet') {
x <- unlist(lapply(x@ionlibrary, function(xx) {
xx@q1
}))
}
# x should be numeric
if (class(x) != "numeric") {
warning("can not compute quantils. 'x' is not numeric.")
return (NULL)
}
x <- x[x > massrange[1] & x < massrange[2]]
q <- FUN(xx=x, nbins=n)
idx <- 1:n
from <- q[idx] - overlap * 0.5
to <- q[idx + 1] + overlap * 0.5
width <- 0.5 * (to - from)
mid <- from + width
h <- hist(x, breaks = q, ...)
data.frame(from, to, mid, width, counts = h$counts)
}
cdsw(exampledata,
freq=TRUE,
overlap = 0,
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## from to mid width counts
## 0% 301 384 342.5 41.5 586
## 5% 384 420 402.0 18.0 591
## 10% 420 449 434.5 14.5 583
## 15% 449 476 462.5 13.5 599
## 20% 476 500 488.0 12.0 565
## 25% 500 524 512.0 12.0 604
## 30% 524 547 535.5 11.5 566
## 35% 547 572 559.5 12.5 598
## 40% 572 598 585.0 13.0 591
## 45% 598 625 611.5 13.5 577
## 50% 625 651 638.0 13.0 603
## 55% 651 682 666.5 15.5 575
## 60% 682 713 697.5 15.5 594
## 65% 713 745 729.0 16.0 574
## 70% 745 783 764.0 19.0 595
## 75% 783 825 804.0 21.0 572
## 80% 825 881 853.0 28.0 592
## 85% 881 948 914.5 33.5 583
## 90% 948 1045 996.5 48.5 592
## 95% 1045 1250 1147.5 102.5 586
.cdsw_objective <- function(splits, data){
counts <- hist(data, breaks=splits,plot=FALSE)$counts
nbins<-length(splits)-1
optimumN <- length(data)/(length(splits)-1)
optimumN<-rep(optimumN,nbins)
score2 <-sqrt(sum((counts - optimumN)^2))
score1 <- sum(abs(counts - round(optimumN)))
return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}
.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
difsp<-diff(splits)
return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}
.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
breaks <- nbins+1
#xx <- x
#xx<-xx[xx >=310 & xx<1250]
# TODO(wew): there is something insconsitent with the nbins parameter
qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
plot(1:breaks, qqs, type="b" ,
sub=".cdsw_compute_sampling_breaks")
legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
paste("nbins = ", breaks) ))
# equidistant spaced bins
unif <- seq(min(xx),max(xx),length=(breaks))
lines(1:breaks,unif,col=2,type="b")
if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
}else{
.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
}
mixeddata <- xx
it_count <- 0
error <- 0
while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
it_count <- it_count + 1
uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
mixeddata<-c(mixeddata,uniformdata)
qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
lines(1:breaks,qqs,type="l", col="#00DD00AA")
error[it_count] <-.cdsw_objective(qqs, xx)$score1
}
lines(1:breaks,qqs,type="b", col="#FF1111AA")
plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
qqs <- as.numeric(sort(round(qqs)))
qqs[1] <- qqs[1] - 0.5
qqs[length(qqs)] <- qqs[length(qqs)] + 0.5
round(qqs, 1)
}
op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
n=35,
massrange = c(350,1250),
sub='sampling based',
main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
res <- hist(ms1data, breaks=breaks)
abline(v=wind$from,col=2,lty=2)
abline(v=wind$to,col=3,lty=2)
empty <- res$mids[which(res$counts < maxbin )]
newfrom <- .makenewfromto(wind$from , empty)
newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
plot(res,xlim=c(1060,1065))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
plot(res,xlim=c(520,550))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
width <- (newto - newfrom) * 0.5
mid <- (newfrom + newto)*0.5
newCounts <- NULL
for(i in 1:length(newfrom))
{
newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
}
data.frame(newfrom, newto, mid, width, counts =newCounts)
}
readjustWindows(wind,exampledata)
## newfrom newto mid width counts
## 1 349.5 379.1 364.30 14.80 305
## 2 379.0 401.1 390.05 11.05 323
## 3 401.0 422.1 411.55 10.55 365
## 4 422.0 441.1 431.55 9.55 398
## 5 441.0 461.1 451.05 10.05 401
## 6 461.0 478.1 469.55 8.55 404
## 7 478.0 496.1 487.05 9.05 435
## 8 496.0 512.0 504.00 8.00 402
## 9 512.0 530.0 521.00 9.00 423
## 10 530.0 546.0 538.00 8.00 410
## 11 546.0 564.0 555.00 9.00 433
## 12 564.0 581.0 572.50 8.50 393
## 13 580.8 599.0 589.90 9.10 422
## 14 599.0 618.0 608.50 9.50 416
## 15 618.0 636.0 627.00 9.00 403
## 16 636.0 655.0 645.50 9.50 422
## 17 655.0 676.0 665.50 10.50 371
## 18 676.0 696.0 686.00 10.00 416
## 19 696.0 716.0 706.00 10.00 361
## 20 716.0 737.0 726.50 10.50 369
## 21 737.0 758.0 747.50 10.50 383
## 22 758.0 782.0 770.00 12.00 336
## 23 782.0 806.0 794.00 12.00 347
## 24 806.0 831.0 818.50 12.50 321
## 25 831.0 861.0 846.00 15.00 313
## 26 861.0 891.0 876.00 15.00 281
## 27 891.0 921.1 906.05 15.05 287
## 28 920.9 954.1 937.50 16.60 274
## 29 954.0 991.1 972.55 18.55 246
## 30 991.0 1028.0 1009.50 18.50 205
## 31 1028.0 1067.0 1047.50 19.50 199
## 32 1067.0 1106.0 1086.50 19.50 182
## 33 1106.0 1153.0 1129.50 23.50 126
## 34 1153.0 1200.0 1176.50 23.50 98
## 35 1200.0 1249.5 1224.75 24.75 71
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=35,
overlap = 0,
sub='quantile based',
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
## from to mid width counts
## 0% 301 363 332.0 31.0 332
## 2.857143% 363 390 376.5 13.5 343
## 5.714286% 390 410 400.0 10.0 328
## 8.571429% 410 429 419.5 9.5 331
## 11.42857% 429 445 437.0 8.0 339
## 14.28571% 445 462 453.5 8.5 347
## 17.14286% 462 476 469.0 7.0 339
## 20% 476 489 482.5 6.5 315
## 22.85714% 489 503 496.0 7.0 333
## 25.71429% 503 517 510.0 7.0 353
## 28.57143% 517 531 524.0 7.0 338
## 31.42857% 531 544 537.5 6.5 316
## 34.28571% 544 557 550.5 6.5 337
## 37.14286% 557 572 564.5 7.5 341
## 40% 572 586 579.0 7.0 337
## 42.85714% 586 601 593.5 7.5 324
## 45.71429% 601 617 609.0 8.0 350
## 48.57143% 617 632 624.5 7.5 332
## 51.42857% 632 646 639.0 7.0 321
## 54.28571% 646 663 654.5 8.5 342
## 57.14286% 663 682 672.5 9.5 340
## 60% 682 698 690.0 8.0 336
## 62.85714% 698 716 707.0 9.0 323
## 65.71429% 716 736 726.0 10.0 350
## 68.57143% 736 754 745.0 9.0 328
## 71.42857% 754 776 765.0 11.0 335
## 74.28571% 776 800 788.0 12.0 336
## 77.14286% 800 825 812.5 12.5 327
## 80% 825 855 840.0 15.0 344
## 82.85714% 855 891 873.0 18.0 330
## 85.71429% 891 928 909.5 18.5 334
## 88.57143% 928 969 948.5 20.5 340
## 91.42857% 969 1029 999.0 30.0 337
## 94.28571% 1029 1097 1063.0 34.0 332
## 97.14286% 1097 1250 1173.5 76.5 336
op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
main = paste("max window size", mws), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = mws)
})
})
op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=nbins,
overlap = 0,
main = paste("nr bins", nbins), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = 100)
})
})
Here is the output of sessionInfo()
on the system on which this document was compiled:
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] specL_1.6.2 protViz_0.2.15 seqinr_3.1-3 ade4_1.7-4
## [5] RSQLite_1.0.0 DBI_0.4-1 BiocStyle_2.0.2
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.5 digest_0.6.9 formatR_1.4 magrittr_1.5
## [5] evaluate_0.9 stringi_1.0-1 rmarkdown_0.9.6 tools_3.3.0
## [9] stringr_1.0.0 yaml_2.1.13 htmltools_0.3.5 knitr_1.13