Title: | Minimal Protein Set Explaining Peptide Spectrum Matches |
---|---|
Description: | Determine minimal protein set explaining peptide spectrum matches. Utility functions for creating fasta amino acid databases with decoys and contaminants. Peptide false discovery rate estimation for target decoy search results on psm, precursor, peptide and protein level. Computing dynamic swath window sizes based on MS1 or MS2 signal distributions. |
Authors: | Witold Wolski [aut, cre] |
Maintainer: | Witold Wolski <[email protected]> |
License: | GPL-3 |
Version: | 0.3.4 |
Built: | 2024-11-12 11:14:55 UTC |
Source: | https://github.com/protviz/prozor |
peptides which do not have protein assignment drop out
annotateAHO(pepseq, fasta, peptide = "peptideSeq")
annotateAHO(pepseq, fasta, peptide = "peptideSeq")
pepseq |
- list of peptides - sequence, optional modified sequence, charge state. |
fasta |
- object as created by |
A data.frame with proteinID, peptideSeq, Offset and proteinSequence
library(dplyr) file = system.file("extdata/IDResults.txt.gz" , package = "prozor") specMeta <- readr::read_tsv(file) upeptide <- unique(specMeta$peptideSeq) resCan <- prozor::readPeptideFasta( system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor")) resCanU <- resCan[!duplicated(unlist(resCan))] annotAll = annotateAHO(upeptide[seq_len(20)], resCanU) dim(annotAll)
library(dplyr) file = system.file("extdata/IDResults.txt.gz" , package = "prozor") specMeta <- readr::read_tsv(file) upeptide <- unique(specMeta$peptideSeq) resCan <- prozor::readPeptideFasta( system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor")) resCanU <- resCan[!duplicated(unlist(resCan))] annotAll = annotateAHO(upeptide[seq_len(20)], resCanU) dim(annotAll)
peptides which do not have protein assignment drop out
annotatePeptides( pepinfo, fasta, peptide = "peptideSeq", prefix = "(([RK])|(^)|(^M))", suffix = "" )
annotatePeptides( pepinfo, fasta, peptide = "peptideSeq", prefix = "(([RK])|(^)|(^M))", suffix = "" )
pepinfo |
- list of peptides - sequence, optional modified sequence, charge state. |
fasta |
- object as created by readPeptideFasta |
peptide |
- name of column containing peptide sequences default "peptideSeq" |
prefix |
- default "(([RK])|(^)|(^M))" |
suffix |
- default "" |
data.frame with columns "peptideSeq", "proteinID","Offset","proteinSequence","matched", "lengthPeptide","proteinlength"
library(dplyr) file = system.file("extdata/IDResults.txt.gz" , package = "prozor") specMeta <- readr::read_tsv(file) upeptide <- unique(specMeta$peptideSeq) resCan <- prozor::readPeptideFasta( system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor")) annotAll = prozor::annotatePeptides(upeptide[seq_len(20)], resCan) dim(annotAll) res <- mutate(annotAll, proteinlength = nchar(proteinSequence)) res <- select(res, proteinID, peptideSeq, proteinlength, Offset, lengthPeptide) head(res)
library(dplyr) file = system.file("extdata/IDResults.txt.gz" , package = "prozor") specMeta <- readr::read_tsv(file) upeptide <- unique(specMeta$peptideSeq) resCan <- prozor::readPeptideFasta( system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor")) annotAll = prozor::annotatePeptides(upeptide[seq_len(20)], resCan) dim(annotAll) res <- mutate(annotAll, proteinlength = nchar(proteinSequence)) res <- select(res, proteinID, peptideSeq, proteinlength, Offset, lengthPeptide) head(res)
Data frame as produced by COMET-MS search engine
initialize
create equidistant breaks
quantile breaks
sampling breaks
barplot showing the number of precursors per window
Table with window boundaries and statistics
summary of the binning process (see objectiveMS1Function for more details)
moves window start and end to region with as few as possible precursor masses
shows the generated DIA cycle
list |
of masses |
nbins |
number of bins default 25 |
maxwindow |
largest window size |
minwindow |
smallest window size |
digits |
mass precision default 2 |
digigits |
mass precision |
max |
number of bins |
plot |
default TRUE |
overlap |
size of window overlap default 1 m/z |
array of masses
array with masses
array with masses
data.frame with columns: - from (window start) - to (window end) - mid (window centre), width (window width) - counts expected number of precursors
list with optimization scores
data.frame with optimized windows
masses
MS1 masses
breaks
the breaks
nbins
number of bins
digits
mass accuracy in result
asTable(overlap = 1)
make windows
error()
show error
optimizeWindows(digits = 1, maxbin = 15, plot = FALSE, overlap = 0)
optimizes the windows
quantile_breaks(digits = 2)
same number of MS1 in each window but might violate hard constraints
sampling_breaks(maxwindow = 150, minwindow = 5, digits = 2, plot = FALSE)
starts with quantile breaks but mixes with uniform data to satisfy had constraints
data(masses) cdsw <- Cdsw(masses) tmp <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE) cdsw$plot() cdsw$asTable() cdsw$breaks cdsw$optimizeWindows() cdsw$showCycle()
data(masses) cdsw <- Cdsw(masses) tmp <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE) cdsw$plot() cdsw$asTable() cdsw$breaks cdsw$optimizeWindows() cdsw$showCycle()
Same as computeFDRwithID but works with decoy_hit boolean vector.
For more details and references see package vignette
vignette("TargetDecoyFDR_Example", package = "prozor")
computeFDR(score, decoy_hit, larger_better = TRUE)
computeFDR(score, decoy_hit, larger_better = TRUE)
score |
score |
decoy_hit |
indicates if decoy hit |
larger_better |
is larger score the better one (default TRUE) |
list with decoy_hit (indicates if decoy), score the search engine score, FDR1 false discovery rate estimated using the method of Gygi, SimpleFDR - estimated using the method of Kaell.
data(fdrSample) fdr1 <- computeFDR(fdrSample$score, grepl("REV_",fdrSample$proteinID), larger_better = FALSE) head(as.data.frame(fdr1))
data(fdrSample) fdr1 <- computeFDR(fdrSample$score, grepl("REV_",fdrSample$proteinID), larger_better = FALSE) head(as.data.frame(fdr1))
For more details and references see package vignette
vignette("TargetDecoyFDR_Example", package = "prozor")
computeFDRwithID(score, ID, decoy = "REV_", larger_better = TRUE)
computeFDRwithID(score, ID, decoy = "REV_", larger_better = TRUE)
score |
a vector with scores |
ID |
- list with protein id's |
decoy |
decoy pattern, default "REV_" |
larger_better |
if larger score better than small (default TRUE), If small score better set FALSE |
list with ID, decoy_hit (indicates if decoy), score the search engine score, FDR1 false discovery rate estimated using the method of Elias and Gygi; FDR2 - estimated using the method of Kell.
data(fdrSample) # call constructor #nrow(fdrSample) #fdrSample <- dplyr::slice_sample(fdrSample, n = 40000) #usethis::use_data(fdrSample, overwrite = TRUE) fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE) names(fdr1) plot(fdr1$score, fdr1$FPR,type="l",xlim=c(0,0.001), ylim=c(0,0.0002)) lines(fdr1$score, fdr1$qValue_FPR, col=2) lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4) lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5) fdr1 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE) names(fdr1) plot(fdr1$score, fdr1$FPR,type="l", xlim=c(2.5,5),ylim=c(0,0.001)) lines(fdr1$score, fdr1$qValue_FPR, col=2) lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4) lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5)
data(fdrSample) # call constructor #nrow(fdrSample) #fdrSample <- dplyr::slice_sample(fdrSample, n = 40000) #usethis::use_data(fdrSample, overwrite = TRUE) fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE) names(fdr1) plot(fdr1$score, fdr1$FPR,type="l",xlim=c(0,0.001), ylim=c(0,0.0002)) lines(fdr1$score, fdr1$qValue_FPR, col=2) lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4) lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5) fdr1 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE) names(fdr1) plot(fdr1$score, fdr1$FPR,type="l", xlim=c(2.5,5),ylim=c(0,0.001)) lines(fdr1$score, fdr1$qValue_FPR, col=2) lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4) lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5)
create fasta db from one or more fasta files
create_fgcz_fasta_db( databasedirectory, useContaminants = loadContaminantsFGCZ2022(), revLab = "REV_", outputdir = NULL, make_summary = TRUE )
create_fgcz_fasta_db( databasedirectory, useContaminants = loadContaminantsFGCZ2022(), revLab = "REV_", outputdir = NULL, make_summary = TRUE )
databasedirectory |
directory with fasta files |
useContaminants |
contaminants to add |
revLab |
reverse label |
outputdir |
output directory |
list list(resDB, filepath , summary, mcall, dbname)
print("NO exmple since function also writes the fasta files")
print("NO exmple since function also writes the fasta files")
For more details and references see package vignette
vignette("CreateDecoyDB", package = "prozor")
createDecoyDB( dbs, useContaminants = loadContaminantsFGCZ2022(), revLab = "REV_", annot = "zz|sourceOf|database" )
createDecoyDB( dbs, useContaminants = loadContaminantsFGCZ2022(), revLab = "REV_", annot = "zz|sourceOf|database" )
dbs |
a path to a fasta file or an array of files |
useContaminants |
list with contaminant sequences |
revLab |
label for reversed peptides (if NULL do not generate decoys) |
annot |
source of database |
list of SeqFastaAA entries
file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor") cont <- loadContaminantsFGCZ2022() rabbit <-readPeptideFasta(file) tmp <- 2*(2*length(rabbit)+length(cont)) + 1 res <- createDecoyDB(c(file,file)) length(res) stopifnot(length(res) == tmp) res <- createDecoyDB(c(file,file), revLab=NULL) stopifnot(length(res) == (2*length(rabbit)+length(cont) + 1)) res <- createDecoyDB(c(file,file), revLab=NULL, useContaminants = NULL) stopifnot(length(res) == (2*length(rabbit) + 1) )
file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor") cont <- loadContaminantsFGCZ2022() rabbit <-readPeptideFasta(file) tmp <- 2*(2*length(rabbit)+length(cont)) + 1 res <- createDecoyDB(c(file,file)) length(res) stopifnot(length(res) == tmp) res <- createDecoyDB(c(file,file), revLab=NULL) stopifnot(length(res) == (2*length(rabbit)+length(cont) + 1)) res <- createDecoyDB(c(file,file), revLab=NULL, useContaminants = NULL) stopifnot(length(res) == (2*length(rabbit) + 1) )
given matrix (columns protein rows peptides), compute minimal protein set using greedy algorithm
greedy_parsimony(pepprot)
greedy_parsimony(pepprot)
pepprot |
matrix as returned by prepareMatrix |
list of peptide protein assignment
#library(prozor) data(protpepmetashort) colnames(protpepmetashort) dim(unique(protpepmetashort[,4])) xx = prepareMatrix(protpepmetashort, peptideID = "peptideModSeq") dim(xx) stopifnot(dim(xx)[1] == dim(unique(protpepmetashort[,4]))[1]) es = greedy_parsimony(xx) debug(prozor:::.greedy2) stopifnot(length(unique(names(es))) == dim(unique(protpepmetashort[,4]))[1])
#library(prozor) data(protpepmetashort) colnames(protpepmetashort) dim(unique(protpepmetashort[,4])) xx = prepareMatrix(protpepmetashort, peptideID = "peptideModSeq") dim(xx) stopifnot(dim(xx)[1] == dim(unique(protpepmetashort[,4]))[1]) es = greedy_parsimony(xx) debug(prozor:::.greedy2) stopifnot(length(unique(names(es))) == dim(unique(protpepmetashort[,4]))[1])
converts result of greedy_parsimony function to a matrix with 3 columns - peptide - charge and protein
greedyRes2Matrix(res)
greedyRes2Matrix(res)
res |
result of function prozor::greedy_parsimony |
matrix of peptide protein assignments
load special prot
load_special_prot_2024(noHuman = FALSE)
load_special_prot_2024(noHuman = FALSE)
noHuman |
should human contaminants be excluded? default FALSE |
list with contaminant sequences
#library(prozor) cont <- load_special_prot_2024() length(cont) contNH <- load_special_prot_2024(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
#library(prozor) cont <- load_special_prot_2024() length(cont) contNH <- load_special_prot_2024(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
These sequences are downloaded from here https://github.com/HaoGroup-ProtContLib/Protein-Contaminant-Libraries-for-DDA-and-DIA-Proteomics
load_universal_contaminants_2024(noHuman = FALSE)
load_universal_contaminants_2024(noHuman = FALSE)
noHuman |
should human contaminants be excluded? default FALSE |
list with contaminant sequences
#library(prozor) cont <- load_universal_contaminants_2024() length(cont) contNH <- load_universal_contaminants_2024(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
#library(prozor) cont <- load_universal_contaminants_2024() length(cont) contNH <- load_universal_contaminants_2024(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
load list of contaminant sequences FGCZ 2019
loadContaminantsFasta2019(noHuman = FALSE)
loadContaminantsFasta2019(noHuman = FALSE)
noHuman |
should human contaminants be excluded? default FALSE |
list with contaminant sequences
#library(prozor) cont <- loadContaminantsFasta2019() length(cont) contNH <- loadContaminantsFasta2019() length(contNH) #example how to create a protein db with decoy sequences
#library(prozor) cont <- loadContaminantsFasta2019() length(cont) contNH <- loadContaminantsFasta2019() length(contNH) #example how to create a protein db with decoy sequences
load list of contaminant sequences FGCZ 2021
loadContaminantsFasta2021(noHuman = FALSE)
loadContaminantsFasta2021(noHuman = FALSE)
noHuman |
should human contaminants be excluded? default FALSE |
list with contaminant sequences
#library(prozor) cont <- loadContaminantsFasta2021() length(cont) contNH <- loadContaminantsFasta2021(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
#library(prozor) cont <- loadContaminantsFasta2021() length(cont) contNH <- loadContaminantsFasta2021(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
load list of contaminant sequences FGCZ 2022
loadContaminantsFGCZ2022(noHuman = FALSE)
loadContaminantsFGCZ2022(noHuman = FALSE)
noHuman |
should human contaminants be excluded? default FALSE |
list with contaminant sequences
#library(prozor) cont <- loadContaminantsFGCZ2022() length(cont) contNH <- loadContaminantsFGCZ2022(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
#library(prozor) cont <- loadContaminantsFGCZ2022() length(cont) contNH <- loadContaminantsFGCZ2022(noHuman = TRUE) length(contNH) #example how to create a protein db with decoy sequences
make db summary which includes number of sequences, and amino acid statistcis
make_fasta_summary(resDB, old = FALSE, as_string = TRUE)
make_fasta_summary(resDB, old = FALSE, as_string = TRUE)
array of strings which should be passed to the cat function.
file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor") xx <- prozor::readPeptideFasta(file) cat(make_fasta_summary(xx)) rbenchmark::benchmark(make_fasta_summary(xx),replications = 10) rbenchmark::benchmark(make_fasta_summary(xx, old = TRUE), replications = 10) make_fasta_summary(xx, as_string = FALSE) make_fasta_summary(xx, old =TRUE, as_string = FALSE)
file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor") xx <- prozor::readPeptideFasta(file) cat(make_fasta_summary(xx)) rbenchmark::benchmark(make_fasta_summary(xx),replications = 10) rbenchmark::benchmark(make_fasta_summary(xx, old = TRUE), replications = 10) make_fasta_summary(xx, as_string = FALSE) make_fasta_summary(xx, old =TRUE, as_string = FALSE)
make id for chain in format sp|P30443|1A01_HUMANs25
makeID(sequence, id, sp)
makeID(sequence, id, sp)
sequence |
- aa sequence as string |
id |
uniprot id id: sp|P30443|1A01_HUMAN |
sp |
start position of chain numeric |
string consisting of id,"s",sp
seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\ GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\ MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\ DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\ VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\ TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\ HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\ SSDRKGGSYTQAASSDSAQGSDVSLTACKV" nam <-"sp|P30443|1A01_HUMAN" sp <- 24 makeID(seq, nam, sp)
seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\ GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\ MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\ DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\ VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\ TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\ HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\ SSDRKGGSYTQAASSDSAQGSDVSLTACKV" nam <-"sp|P30443|1A01_HUMAN" sp <- 24 makeID(seq, nam, sp)
make id for chain compatible with uniprot
makeIDUnip(sequence, id, sp)
makeIDUnip(sequence, id, sp)
sequence |
- aa sequence as string |
id |
uniprot id id: sp|P30443|1A01_HUMAN |
sp |
start position of chain numeric |
string consisting of sp,"-", length of sequnce
seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\ GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\ MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\ DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\ VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\ TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\ HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\ SSDRKGGSYTQAASSDSAQGSDVSLTACKV" nam <-"sp|P30443|1A01_HUMAN" sp <- 24 makeIDUnip(seq, nam, sp)
seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\ GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\ MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\ DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\ VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\ TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\ HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\ SSDRKGGSYTQAASSDSAQGSDVSLTACKV" nam <-"sp|P30443|1A01_HUMAN" sp <- 24 makeIDUnip(seq, nam, sp)
MS masses A dataset containing approx 150000 MS1 precursor masses
compute the deviation from optimum: equal number of MS1 per bin
objectiveMS1Function(splits, data)
objectiveMS1Function(splits, data)
splits |
the new window boundaries |
data |
the data |
list with score1 - manhattan distance, score2 - euclidean distance, counts - observed counts, optimumN - optimum counts
For more details and references see package vignette
vignette("TargetDecoyFDR_Example", package = "prozor")
plotFDR(data)
plotFDR(data)
data |
data returned by computeFDR function |
creates a plot
#library(prozor) data(fdrSample) fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE) fdr2 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE) plotFDR(fdr1) plotFDR(fdr2) data<-fdr1
#library(prozor) data(fdrSample) fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE) fdr2 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE) plotFDR(fdr1) plotFDR(fdr2) data<-fdr1
For more details and references see package vignette
vignette("TargetDecoyFDR_Example", package = "prozor")
predictScoreFDR(fdrObj, qValue = 1, method = "SimpleFDR")
predictScoreFDR(fdrObj, qValue = 1, method = "SimpleFDR")
fdrObj |
object generated by computeFDR |
qValue |
false discovery rate in percent, default 1 percent |
method |
either FPR or SimpleFDR, default is SimpleFDR |
score for a given FDR
data(fdrSample) fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE) predictScoreFDR(fdr1,qValue=5) fdr2<-computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE) predictScoreFDR(fdr2,qValue=5)
data(fdrSample) fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE) predictScoreFDR(fdr1,qValue=5) fdr2<-computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE) predictScoreFDR(fdr2,qValue=5)
given table of peptide protein assigments generate matrix
prepareMatrix( data, proteinID = "proteinID", peptideID = "strippedSequence", weighting = NULL, sep = "|", use.last.ij = TRUE )
prepareMatrix( data, proteinID = "proteinID", peptideID = "strippedSequence", weighting = NULL, sep = "|", use.last.ij = TRUE )
data |
generated by annotatePeptides |
proteinID |
protein ID column |
peptideID |
peptide / precursor ID column |
weighting |
weight type to use. Options are "one" , "AA" - amino acids, "coverage" - coverage , "inverse" - inverse peptide frequencies |
sep |
separator for precursor (rownames) |
sparse matrix
#library(prozor) data(protpepmetashort) library(Matrix) colnames(protpepmetashort) head(protpepmetashort) dim(protpepmetashort) count = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" ) dim(count) inverse = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" , weight = "inverse") #aa = prepareMatrix(protpepmetashort, peptideID = "peptideSeq" , weight = "AA") #xx = prepareMatrix(protpepmetashort, peptideID = "peptideSeq" , weight = "coverage") image( as.matrix(count) ) corProt = cor( as.matrix(count) ) par(mfrow =c(1,2)) image(corProt) #penalise peptides matching many proteins corProtn = cor( as.matrix(inverse) ) image(corProtn)
#library(prozor) data(protpepmetashort) library(Matrix) colnames(protpepmetashort) head(protpepmetashort) dim(protpepmetashort) count = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" ) dim(count) inverse = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" , weight = "inverse") #aa = prepareMatrix(protpepmetashort, peptideID = "peptideSeq" , weight = "AA") #xx = prepareMatrix(protpepmetashort, peptideID = "peptideSeq" , weight = "coverage") image( as.matrix(count) ) corProt = cor( as.matrix(count) ) par(mfrow =c(1,2)) image(corProt) #penalise peptides matching many proteins corProtn = cor( as.matrix(inverse) ) image(corProtn)
Small version of pepprot dataset to speed up computation
Determine minimal protein set explaining peptide spectrum matches. Utility functions for creating fasta amino acid databases with decoys and contaminants. Peptide false discovery rate estimation for target decoy search results on psm, precursor, peptide and protein level. Computing dynamic swath window sizes based on MS1 and MS2 signal distributions.
Maintainer: Witold Wolski [email protected] (ORCID)
Useful links:
Readjust windows so that boundaries in regions of few peaks.
readjustWindows(wind, ms1data, digits = 1, maxbin = 15, plot = FALSE)
readjustWindows(wind, ms1data, digits = 1, maxbin = 15, plot = FALSE)
wind |
a data frame with columns from and to |
ms1data |
masses |
digits |
mass accuracy |
maxbin |
maximum number of bins |
plot |
diagnostic plots (default FALSE) |
data.frame of same format as wind
but with improved start and end masses.
data(masses) cdsw <- Cdsw(masses) breaks <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE) table <- cdsw$asTable() dim(table) head(table) tmp <- readjustWindows(table, masses,maxbin=10) data.frame(tmp)
data(masses) cdsw <- Cdsw(masses) breaks <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE) table <- cdsw$asTable() dim(table) head(table) tmp <- readjustWindows(table, masses,maxbin=10) data.frame(tmp)
peptides which do not have protein assignment drop out
readPeptideFasta(file)
readPeptideFasta(file)
file |
- fasta file |
list with sequences
library(seqinr) file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor") fasta = readPeptideFasta(file)
library(seqinr) file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor") fasta = readPeptideFasta(file)
remove signal peptides from main chain
removeSignalPeptide(db, signal, idfun = makeID)
removeSignalPeptide(db, signal, idfun = makeID)
db |
uniprot fasta database as list |
signal |
tab delimited file with signals |
idfun |
function to generate id's |
list with sequences with signal peptide removed
peptides which do not have protein assignment drop out
reverseSeq(fasta, revLab = "REV_")
reverseSeq(fasta, revLab = "REV_")
fasta |
- an r list with SeqFastaAA |
revLab |
- how to label reverse sequences, default = REV_ |
string with reversed sequence
library(seqinr) #library(prozor) file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz", package="prozor") fasta = readPeptideFasta(file = file) getAnnot(fasta[[1]]) x <- reverseSeq(fasta) revseq <- reverseSeq(fasta ,revLab = "REV_") stopifnot(length(revseq) == length(fasta)) stopifnot(grep("^REV_","REV_zz|ZZ_FGCZCont0000|")==1) tmp <- list(as.SeqFastaAA(("DYKDDDDK"),name="Flag|FLAG|p2079",Annot="")) reverseSeq(tmp)
library(seqinr) #library(prozor) file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz", package="prozor") fasta = readPeptideFasta(file = file) getAnnot(fasta[[1]]) x <- reverseSeq(fasta) revseq <- reverseSeq(fasta ,revLab = "REV_") stopifnot(length(revseq) == length(fasta)) stopifnot(grep("^REV_","REV_zz|ZZ_FGCZCont0000|")==1) tmp <- list(as.SeqFastaAA(("DYKDDDDK"),name="Flag|FLAG|p2079",Annot="")) reverseSeq(tmp)
peptides which do not have protein assignment drop out
writeFasta(file, ...)
writeFasta(file, ...)
file |
where to write |
... |
fasta list or single file |
writes a file.
#example how to create a protein db with decoy sequences library(seqinr) #library(prozor) file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor") fasta = readPeptideFasta(file = file) revfasta <- reverseSeq(fasta) decoyDB <- c(fasta,revfasta) stopifnot(length(decoyDB) == 2 * length(fasta)) ## Not run: writeFasta(decoyDB, file="test.fasta") ## End(Not run)
#example how to create a protein db with decoy sequences library(seqinr) #library(prozor) file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor") fasta = readPeptideFasta(file = file) revfasta <- reverseSeq(fasta) decoyDB <- c(fasta,revfasta) stopifnot(length(decoyDB) == 2 * length(fasta)) ## Not run: writeFasta(decoyDB, file="test.fasta") ## End(Not run)