Package 'prozor' reference manual

Title:	Minimal Protein Set Explaining Peptide Spectrum Matches
Description:	Determine minimal protein set explaining peptide spectrum matches. Utility functions for creating fasta amino acid databases with decoys and contaminants. Peptide false discovery rate estimation for target decoy search results on psm, precursor, peptide and protein level. Computing dynamic swath window sizes based on MS1 or MS2 signal distributions.
Authors:	Witold Wolski [aut, cre]
Maintainer:	Witold Wolski <[email protected]>
License:	GPL-3
Version:	0.3.4
Built:	2025-03-23 02:51:32 UTC
Source:	https://github.com/protviz/prozor

annotate peptides using AhoCorasickTrie

Description

peptides which do not have protein assignment drop out

Usage

annotateAHO(pepseq, fasta, peptide = "peptideSeq")
annotateAHO(pepseq, fasta, peptide = "peptideSeq")

Arguments

`pepseq`	- list of peptides - sequence, optional modified sequence, charge state.
`fasta`	- object as created by `readPeptideFasta`

Value

A data.frame with proteinID, peptideSeq, Offset and proteinSequence

Examples


library(dplyr)

file = system.file("extdata/IDResults.txt.gz" , package = "prozor")
specMeta <- readr::read_tsv(file)
upeptide <- unique(specMeta$peptideSeq)
resCan <-
   prozor::readPeptideFasta(
       system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor"))
resCanU <- resCan[!duplicated(unlist(resCan))]
annotAll = annotateAHO(upeptide[seq_len(20)], resCanU)
dim(annotAll)

library(dplyr)

file = system.file("extdata/IDResults.txt.gz" , package = "prozor")
specMeta <- readr::read_tsv(file)
upeptide <- unique(specMeta$peptideSeq)
resCan <-
   prozor::readPeptideFasta(
       system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor"))
resCanU <- resCan[!duplicated(unlist(resCan))]
annotAll = annotateAHO(upeptide[seq_len(20)], resCanU)
dim(annotAll)

Annotate peptides with protein ids

Description

peptides which do not have protein assignment drop out

Usage

annotatePeptides(
  pepinfo,
  fasta,
  peptide = "peptideSeq",
  prefix = "(([RK])|(^)|(^M))",
  suffix = ""
)
annotatePeptides(
  pepinfo,
  fasta,
  peptide = "peptideSeq",
  prefix = "(([RK])|(^)|(^M))",
  suffix = ""
)

Arguments

`pepinfo`	- list of peptides - sequence, optional modified sequence, charge state.
`fasta`	- object as created by readPeptideFasta
`peptide`	- name of column containing peptide sequences default "peptideSeq"
`prefix`	- default "(([RK])\|(^)\|(^M))"
`suffix`	- default ""

Value

data.frame with columns "peptideSeq", "proteinID","Offset","proteinSequence","matched", "lengthPeptide","proteinlength"

Examples


library(dplyr)

file = system.file("extdata/IDResults.txt.gz" , package = "prozor")
specMeta <- readr::read_tsv(file)
upeptide <- unique(specMeta$peptideSeq)
resCan <-
   prozor::readPeptideFasta(
       system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor"))

annotAll = prozor::annotatePeptides(upeptide[seq_len(20)], resCan)
dim(annotAll)

res <-  mutate(annotAll, proteinlength = nchar(proteinSequence))
res <-  select(res, proteinID, peptideSeq, proteinlength, Offset, lengthPeptide)
head(res)
library(dplyr)

file = system.file("extdata/IDResults.txt.gz" , package = "prozor")
specMeta <- readr::read_tsv(file)
upeptide <- unique(specMeta$peptideSeq)
resCan <-
   prozor::readPeptideFasta(
       system.file("p1000_db1_example/Annotation_canSeq.fasta.gz" , package = "prozor"))

annotAll = prozor::annotatePeptides(upeptide[seq_len(20)], resCan)
dim(annotAll)

res <-  mutate(annotAll, proteinlength = nchar(proteinSequence))
res <-  select(res, proteinID, peptideSeq, proteinlength, Offset, lengthPeptide)
head(res)

Data frame as produced by COMET-MS search engine

Description

Data frame as produced by COMET-MS search engine

Compute dynamic swath windows

Description

initialize

create equidistant breaks

quantile breaks

sampling breaks

barplot showing the number of precursors per window

Table with window boundaries and statistics

summary of the binning process (see objectiveMS1Function for more details)

moves window start and end to region with as few as possible precursor masses

shows the generated DIA cycle

Arguments

`list`	of masses
`nbins`	number of bins default 25
`maxwindow`	largest window size
`minwindow`	smallest window size
`digits`	mass precision default 2
`digigits`	mass precision
`max`	number of bins
`plot`	default TRUE
`overlap`	size of window overlap default 1 m/z

Value

array of masses

array with masses

data.frame with columns: - from (window start) - to (window end) - mid (window centre), width (window width) - counts expected number of precursors

list with optimization scores

data.frame with optimized windows

Fields

masses: MS1 masses
breaks: the breaks
nbins: number of bins
digits: mass accuracy in result

Methods

asTable(overlap = 1): make windows
error(): show error
optimizeWindows(digits = 1, maxbin = 15, plot = FALSE, overlap = 0): optimizes the windows
quantile_breaks(digits = 2): same number of MS1 in each window but might violate hard constraints
sampling_breaks(maxwindow = 150, minwindow = 5, digits = 2, plot = FALSE): starts with quantile breaks but mixes with uniform data to satisfy had constraints

Examples

data(masses)
cdsw <- Cdsw(masses)
tmp <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE)
cdsw$plot()
cdsw$asTable()
cdsw$breaks
cdsw$optimizeWindows()
cdsw$showCycle()
data(masses)
cdsw <- Cdsw(masses)
tmp <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE)
cdsw$plot()
cdsw$asTable()
cdsw$breaks
cdsw$optimizeWindows()
cdsw$showCycle()

Compute FDR given a score

Description

Same as computeFDRwithID but works with decoy_hit boolean vector. For more details and references see package vignette vignette("TargetDecoyFDR_Example", package = "prozor")

Usage

computeFDR(score, decoy_hit, larger_better = TRUE)
computeFDR(score, decoy_hit, larger_better = TRUE)

Arguments

`score`	score
`decoy_hit`	indicates if decoy hit
`larger_better`	is larger score the better one (default TRUE)

Value

list with decoy_hit (indicates if decoy), score the search engine score, FDR1 false discovery rate estimated using the method of Gygi, SimpleFDR - estimated using the method of Kaell.

Examples

data(fdrSample)

fdr1 <- computeFDR(fdrSample$score, grepl("REV_",fdrSample$proteinID), larger_better = FALSE)
head(as.data.frame(fdr1))

data(fdrSample)

fdr1 <- computeFDR(fdrSample$score, grepl("REV_",fdrSample$proteinID), larger_better = FALSE)
head(as.data.frame(fdr1))

Compute FDR given a score

Description

For more details and references see package vignette vignette("TargetDecoyFDR_Example", package = "prozor")

Usage

computeFDRwithID(score, ID, decoy = "REV_", larger_better = TRUE)
computeFDRwithID(score, ID, decoy = "REV_", larger_better = TRUE)

Arguments

`score`	a vector with scores
`ID`	- list with protein id's
`decoy`	decoy pattern, default "REV_"
`larger_better`	if larger score better than small (default TRUE), If small score better set FALSE

Value

list with ID, decoy_hit (indicates if decoy), score the search engine score, FDR1 false discovery rate estimated using the method of Elias and Gygi; FDR2 - estimated using the method of Kell.

Examples

data(fdrSample)
# call constructor
#nrow(fdrSample)
#fdrSample <- dplyr::slice_sample(fdrSample, n = 40000)

#usethis::use_data(fdrSample, overwrite = TRUE)
fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE)
names(fdr1)
plot(fdr1$score, fdr1$FPR,type="l",xlim=c(0,0.001), ylim=c(0,0.0002))
lines(fdr1$score, fdr1$qValue_FPR, col=2)
lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4)
lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5)


fdr1 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE)
names(fdr1)
plot(fdr1$score, fdr1$FPR,type="l", xlim=c(2.5,5),ylim=c(0,0.001))
lines(fdr1$score, fdr1$qValue_FPR, col=2)
lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4)
lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5)

data(fdrSample)
# call constructor
#nrow(fdrSample)
#fdrSample <- dplyr::slice_sample(fdrSample, n = 40000)

#usethis::use_data(fdrSample, overwrite = TRUE)
fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE)
names(fdr1)
plot(fdr1$score, fdr1$FPR,type="l",xlim=c(0,0.001), ylim=c(0,0.0002))
lines(fdr1$score, fdr1$qValue_FPR, col=2)
lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4)
lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5)


fdr1 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE)
names(fdr1)
plot(fdr1$score, fdr1$FPR,type="l", xlim=c(2.5,5),ylim=c(0,0.001))
lines(fdr1$score, fdr1$qValue_FPR, col=2)
lines(fdr1$score, fdr1$SimpleFDR,type="l",col=4)
lines(fdr1$score, fdr1$qValue_SimpleFDR, col=5)

create fasta db from one or more fasta files

Description

create fasta db from one or more fasta files

Usage

create_fgcz_fasta_db(
  databasedirectory,
  useContaminants = loadContaminantsFGCZ2022(),
  revLab = "REV_",
  outputdir = NULL,
  make_summary = TRUE
)
create_fgcz_fasta_db(
  databasedirectory,
  useContaminants = loadContaminantsFGCZ2022(),
  revLab = "REV_",
  outputdir = NULL,
  make_summary = TRUE
)

Arguments

`databasedirectory`	directory with fasta files
`useContaminants`	contaminants to add
`revLab`	reverse label
`outputdir`	output directory

Value

list list(resDB, filepath , summary, mcall, dbname)

Examples

print("NO exmple since function also writes the fasta files")

print("NO exmple since function also writes the fasta files")

Create db with decoys and contaminants

Description

For more details and references see package vignette vignette("CreateDecoyDB", package = "prozor")

Usage

createDecoyDB(
  dbs,
  useContaminants = loadContaminantsFGCZ2022(),
  revLab = "REV_",
  annot = "zz|sourceOf|database"
)
createDecoyDB(
  dbs,
  useContaminants = loadContaminantsFGCZ2022(),
  revLab = "REV_",
  annot = "zz|sourceOf|database"
)

Arguments

`dbs`	a path to a fasta file or an array of files
`useContaminants`	list with contaminant sequences
`revLab`	label for reversed peptides (if NULL do not generate decoys)
`annot`	source of database

Value

list of SeqFastaAA entries

Examples

file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor")
cont <- loadContaminantsFGCZ2022()
rabbit <-readPeptideFasta(file)
tmp <- 2*(2*length(rabbit)+length(cont)) + 1
res <- createDecoyDB(c(file,file))
length(res)
stopifnot(length(res) == tmp)

res <- createDecoyDB(c(file,file), revLab=NULL)
stopifnot(length(res) == (2*length(rabbit)+length(cont) + 1))
res <- createDecoyDB(c(file,file), revLab=NULL, useContaminants = NULL)
stopifnot(length(res) == (2*length(rabbit) + 1) )

file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor")
cont <- loadContaminantsFGCZ2022()
rabbit <-readPeptideFasta(file)
tmp <- 2*(2*length(rabbit)+length(cont)) + 1
res <- createDecoyDB(c(file,file))
length(res)
stopifnot(length(res) == tmp)

res <- createDecoyDB(c(file,file), revLab=NULL)
stopifnot(length(res) == (2*length(rabbit)+length(cont) + 1))
res <- createDecoyDB(c(file,file), revLab=NULL, useContaminants = NULL)
stopifnot(length(res) == (2*length(rabbit) + 1) )

Data frame score and proteinID

Description

Data frame score and proteinID

given matrix (columns protein rows peptides), compute minimal protein set using greedy algorithm

Description

given matrix (columns protein rows peptides), compute minimal protein set using greedy algorithm

Usage

greedy_parsimony(pepprot)
greedy_parsimony(pepprot)

Arguments

pepprot

matrix as returned by prepareMatrix

Value

list of peptide protein assignment

Examples

#library(prozor)

data(protpepmetashort)
colnames(protpepmetashort)
dim(unique(protpepmetashort[,4]))
xx = prepareMatrix(protpepmetashort, peptideID = "peptideModSeq")
dim(xx)
stopifnot(dim(xx)[1] == dim(unique(protpepmetashort[,4]))[1])

es = greedy_parsimony(xx)
debug(prozor:::.greedy2)
stopifnot(length(unique(names(es))) == dim(unique(protpepmetashort[,4]))[1])

#library(prozor)

data(protpepmetashort)
colnames(protpepmetashort)
dim(unique(protpepmetashort[,4]))
xx = prepareMatrix(protpepmetashort, peptideID = "peptideModSeq")
dim(xx)
stopifnot(dim(xx)[1] == dim(unique(protpepmetashort[,4]))[1])

es = greedy_parsimony(xx)
debug(prozor:::.greedy2)
stopifnot(length(unique(names(es))) == dim(unique(protpepmetashort[,4]))[1])

converts result of greedy_parsimony function to a matrix with 3 columns - peptide - charge and protein

Description

converts result of greedy_parsimony function to a matrix with 3 columns - peptide - charge and protein

Usage

greedyRes2Matrix(res)
greedyRes2Matrix(res)

Arguments

res

result of function prozor::greedy_parsimony

Value

matrix of peptide protein assignments

load special prot

Description

load special prot

Usage

load_special_prot_2024(noHuman = FALSE)
load_special_prot_2024(noHuman = FALSE)

Arguments

noHuman

should human contaminants be excluded? default FALSE

Value

list with contaminant sequences

Examples

#library(prozor)
cont <- load_special_prot_2024()
length(cont)
contNH <- load_special_prot_2024(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences
#library(prozor)
cont <- load_special_prot_2024()
length(cont)
contNH <- load_special_prot_2024(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences

load universal contaminants

Description

These sequences are downloaded from here https://github.com/HaoGroup-ProtContLib/Protein-Contaminant-Libraries-for-DDA-and-DIA-Proteomics

Usage

load_universal_contaminants_2024(noHuman = FALSE)
load_universal_contaminants_2024(noHuman = FALSE)

Arguments

noHuman

should human contaminants be excluded? default FALSE

Value

list with contaminant sequences

Examples

#library(prozor)
cont <- load_universal_contaminants_2024()
length(cont)
contNH <- load_universal_contaminants_2024(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences
#library(prozor)
cont <- load_universal_contaminants_2024()
length(cont)
contNH <- load_universal_contaminants_2024(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences

load list of contaminant sequences FGCZ 2019

Description

load list of contaminant sequences FGCZ 2019

Usage

loadContaminantsFasta2019(noHuman = FALSE)
loadContaminantsFasta2019(noHuman = FALSE)

Arguments

noHuman

should human contaminants be excluded? default FALSE

Value

list with contaminant sequences

Examples

#library(prozor)
cont <- loadContaminantsFasta2019()
length(cont)
contNH <- loadContaminantsFasta2019()
length(contNH)
#example how to create a protein db with decoy sequences
#library(prozor)
cont <- loadContaminantsFasta2019()
length(cont)
contNH <- loadContaminantsFasta2019()
length(contNH)
#example how to create a protein db with decoy sequences

load list of contaminant sequences FGCZ 2021

Description

load list of contaminant sequences FGCZ 2021

Usage

loadContaminantsFasta2021(noHuman = FALSE)
loadContaminantsFasta2021(noHuman = FALSE)

Arguments

noHuman

should human contaminants be excluded? default FALSE

Value

list with contaminant sequences

Examples

#library(prozor)
cont <- loadContaminantsFasta2021()
length(cont)
contNH <- loadContaminantsFasta2021(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences
#library(prozor)
cont <- loadContaminantsFasta2021()
length(cont)
contNH <- loadContaminantsFasta2021(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences

load list of contaminant sequences FGCZ 2022

Description

load list of contaminant sequences FGCZ 2022

Usage

loadContaminantsFGCZ2022(noHuman = FALSE)
loadContaminantsFGCZ2022(noHuman = FALSE)

Arguments

noHuman

should human contaminants be excluded? default FALSE

Value

list with contaminant sequences

Examples

#library(prozor)
cont <- loadContaminantsFGCZ2022()
length(cont)
contNH <- loadContaminantsFGCZ2022(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences
#library(prozor)
cont <- loadContaminantsFGCZ2022()
length(cont)
contNH <- loadContaminantsFGCZ2022(noHuman = TRUE)
length(contNH)
#example how to create a protein db with decoy sequences

make db summary which includes number of sequences, and amino acid statistcis

Description

make db summary which includes number of sequences, and amino acid statistcis

Usage

make_fasta_summary(resDB, old = FALSE, as_string = TRUE)
make_fasta_summary(resDB, old = FALSE, as_string = TRUE)

Value

array of strings which should be passed to the cat function.

Examples

file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor")
xx <- prozor::readPeptideFasta(file)
cat(make_fasta_summary(xx))
rbenchmark::benchmark(make_fasta_summary(xx),replications = 10)
rbenchmark::benchmark(make_fasta_summary(xx, old = TRUE), replications = 10)
make_fasta_summary(xx, as_string = FALSE)
make_fasta_summary(xx, old =TRUE, as_string = FALSE)
file = system.file("extdata/fgcz_contaminants2022_20220405.fasta.gz",package="prozor")
xx <- prozor::readPeptideFasta(file)
cat(make_fasta_summary(xx))
rbenchmark::benchmark(make_fasta_summary(xx),replications = 10)
rbenchmark::benchmark(make_fasta_summary(xx, old = TRUE), replications = 10)
make_fasta_summary(xx, as_string = FALSE)
make_fasta_summary(xx, old =TRUE, as_string = FALSE)

make id for chain in format sp|P30443|1A01_HUMANs25

Description

make id for chain in format sp|P30443|1A01_HUMANs25

Usage

makeID(sequence, id, sp)
makeID(sequence, id, sp)

Arguments

`sequence`	- aa sequence as string
`id`	uniprot id id: sp\|P30443\|1A01_HUMAN
`sp`	start position of chain numeric

Value

string consisting of id,"s",sp

Examples

seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\
GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\
MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\
DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\
VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\
TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\
HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\
SSDRKGGSYTQAASSDSAQGSDVSLTACKV"
nam <-"sp|P30443|1A01_HUMAN"
sp <- 24
makeID(seq, nam, sp)
seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\
GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\
MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\
DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\
VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\
TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\
HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\
SSDRKGGSYTQAASSDSAQGSDVSLTACKV"
nam <-"sp|P30443|1A01_HUMAN"
sp <- 24
makeID(seq, nam, sp)

make id for chain compatible with uniprot

Description

make id for chain compatible with uniprot

Usage

makeIDUnip(sequence, id, sp)
makeIDUnip(sequence, id, sp)

Arguments

`sequence`	- aa sequence as string
`id`	uniprot id id: sp\|P30443\|1A01_HUMAN
`sp`	start position of chain numeric

Value

string consisting of sp,"-", length of sequnce

Examples

seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\
GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\
MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\
DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\
VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\
TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\
HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\
SSDRKGGSYTQAASSDSAQGSDVSLTACKV"
nam <-"sp|P30443|1A01_HUMAN"
sp <- 24
makeIDUnip(seq, nam, sp)
seq <- "MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGR\
GEPRFIAVGYVDDTQFVRFDSDAASQKMEPRAPWIEQEGPEYWDQETRN\
MKAHSQTDRANLGTLRGYYNQSEDGSHTIQIMYGCDVGPDGRFLRGYRQ\
DAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAVHAAEQRRVYLEGRC\
VDGLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEI\
TLTWQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQ\
HEGLPKPLTLRWELSSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRK\
SSDRKGGSYTQAASSDSAQGSDVSLTACKV"
nam <-"sp|P30443|1A01_HUMAN"
sp <- 24
makeIDUnip(seq, nam, sp)

MS masses A dataset containing approx 150000 MS1 precursor masses

Description

MS masses A dataset containing approx 150000 MS1 precursor masses

compute the deviation from optimum: equal number of MS1 per bin

Description

compute the deviation from optimum: equal number of MS1 per bin

Usage

objectiveMS1Function(splits, data)
objectiveMS1Function(splits, data)

Arguments

`splits`	the new window boundaries
`data`	the data

Value

list with score1 - manhattan distance, score2 - euclidean distance, counts - observed counts, optimumN - optimum counts

Table containing peptide information

Description

Table containing peptide information

plot FDR

Description

For more details and references see package vignette vignette("TargetDecoyFDR_Example", package = "prozor")

Usage

plotFDR(data)
plotFDR(data)

Arguments

data

data returned by computeFDR function

Value

creates a plot

Examples

#library(prozor)
data(fdrSample)
fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE)
fdr2 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE)
plotFDR(fdr1)
plotFDR(fdr2)
data<-fdr1

#library(prozor)
data(fdrSample)
fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE)
fdr2 <- computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE)
plotFDR(fdr1)
plotFDR(fdr2)
data<-fdr1

Predict score given FDR

Description

For more details and references see package vignette vignette("TargetDecoyFDR_Example", package = "prozor")

Usage

predictScoreFDR(fdrObj, qValue = 1, method = "SimpleFDR")
predictScoreFDR(fdrObj, qValue = 1, method = "SimpleFDR")

Arguments

`fdrObj`	object generated by computeFDR
`qValue`	false discovery rate in percent, default 1 percent
`method`	either FPR or SimpleFDR, default is SimpleFDR

Value

score for a given FDR

Examples

data(fdrSample)
fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE)

predictScoreFDR(fdr1,qValue=5)
fdr2<-computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE)
predictScoreFDR(fdr2,qValue=5)

data(fdrSample)
fdr1 <- computeFDRwithID(fdrSample$score, fdrSample$proteinID, larger_better = FALSE)

predictScoreFDR(fdr1,qValue=5)
fdr2<-computeFDRwithID(fdrSample$score2, fdrSample$proteinID, larger_better = TRUE)
predictScoreFDR(fdr2,qValue=5)

given table of peptide protein assigments generate matrix

Description

given table of peptide protein assigments generate matrix

Usage

prepareMatrix(
  data,
  proteinID = "proteinID",
  peptideID = "strippedSequence",
  weighting = NULL,
  sep = "|",
  use.last.ij = TRUE
)
prepareMatrix(
  data,
  proteinID = "proteinID",
  peptideID = "strippedSequence",
  weighting = NULL,
  sep = "|",
  use.last.ij = TRUE
)

Arguments

`data`	generated by annotatePeptides
`proteinID`	protein ID column
`peptideID`	peptide / precursor ID column
`weighting`	weight type to use. Options are "one" , "AA" - amino acids, "coverage" - coverage , "inverse" - inverse peptide frequencies
`sep`	separator for precursor (rownames)

Value

sparse matrix

Examples

#library(prozor)
data(protpepmetashort)
library(Matrix)
colnames(protpepmetashort)
head(protpepmetashort)
dim(protpepmetashort)
count = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" )
dim(count)
inverse = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" , weight = "inverse")
#aa = prepareMatrix(protpepmetashort,  peptideID = "peptideSeq" , weight = "AA")
#xx = prepareMatrix(protpepmetashort,  peptideID = "peptideSeq" , weight = "coverage")
image( as.matrix(count) )

corProt = cor( as.matrix(count) )
par(mfrow =c(1,2))
image(corProt)

#penalise peptides matching many proteins
corProtn = cor( as.matrix(inverse) )
image(corProtn)

#library(prozor)
data(protpepmetashort)
library(Matrix)
colnames(protpepmetashort)
head(protpepmetashort)
dim(protpepmetashort)
count = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" )
dim(count)
inverse = prepareMatrix( protpepmetashort, peptideID = "peptideSeq" , weight = "inverse")
#aa = prepareMatrix(protpepmetashort,  peptideID = "peptideSeq" , weight = "AA")
#xx = prepareMatrix(protpepmetashort,  peptideID = "peptideSeq" , weight = "coverage")
image( as.matrix(count) )

corProt = cor( as.matrix(count) )
par(mfrow =c(1,2))
image(corProt)

#penalise peptides matching many proteins
corProtn = cor( as.matrix(inverse) )
image(corProtn)

Small version of pepprot dataset to speed up computation

Description

Small version of pepprot dataset to speed up computation

Minimal Protein Set Explaining Peptides

Description

Determine minimal protein set explaining peptide spectrum matches. Utility functions for creating fasta amino acid databases with decoys and contaminants. Peptide false discovery rate estimation for target decoy search results on psm, precursor, peptide and protein level. Computing dynamic swath window sizes based on MS1 and MS2 signal distributions.

Author(s)

Maintainer: Witold Wolski [email protected] (ORCID)

Readjust windows so that boundaries in regions of few peaks.

Description

Readjust windows so that boundaries in regions of few peaks.

Usage

readjustWindows(wind, ms1data, digits = 1, maxbin = 15, plot = FALSE)
readjustWindows(wind, ms1data, digits = 1, maxbin = 15, plot = FALSE)

Arguments

`wind`	a data frame with columns from and to
`ms1data`	masses
`digits`	mass accuracy
`maxbin`	maximum number of bins
`plot`	diagnostic plots (default FALSE)

Value

data.frame of same format as wind but with improved start and end masses.

Examples

data(masses)
cdsw <- Cdsw(masses)
breaks <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE)
table <- cdsw$asTable()
dim(table)
head(table)

tmp <- readjustWindows(table, masses,maxbin=10)
data.frame(tmp)

data(masses)
cdsw <- Cdsw(masses)
breaks <- cdsw$sampling_breaks(maxwindow=100,plot=TRUE)
table <- cdsw$asTable()
dim(table)
head(table)

tmp <- readjustWindows(table, masses,maxbin=10)
data.frame(tmp)

wrapper setting the correct parameters seqinr::read.fasta for reading peptide sequences

Description

peptides which do not have protein assignment drop out

Usage

readPeptideFasta(file)
readPeptideFasta(file)

Arguments

file

- fasta file

Value

list with sequences

Examples

library(seqinr)

file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor")
fasta = readPeptideFasta(file)

library(seqinr)

file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor")
fasta = readPeptideFasta(file)

remove signal peptides from main chain

Description

remove signal peptides from main chain

Usage

removeSignalPeptide(db, signal, idfun = makeID)
removeSignalPeptide(db, signal, idfun = makeID)

Arguments

`db`	uniprot fasta database as list
`signal`	tab delimited file with signals
`idfun`	function to generate id's

Value

list with sequences with signal peptide removed

create rev sequences to fasta list

Description

peptides which do not have protein assignment drop out

Usage

reverseSeq(fasta, revLab = "REV_")
reverseSeq(fasta, revLab = "REV_")

Arguments

`fasta`	- an r list with SeqFastaAA
`revLab`	- how to label reverse sequences, default = REV_

Value

string with reversed sequence

Examples

library(seqinr)
#library(prozor)

file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz", package="prozor")
fasta = readPeptideFasta(file = file)
getAnnot(fasta[[1]])
x <- reverseSeq(fasta)

revseq <- reverseSeq(fasta ,revLab = "REV_")
stopifnot(length(revseq) == length(fasta))
stopifnot(grep("^REV_","REV_zz|ZZ_FGCZCont0000|")==1)

tmp <- list(as.SeqFastaAA(("DYKDDDDK"),name="Flag|FLAG|p2079",Annot=""))

reverseSeq(tmp)

library(seqinr)
#library(prozor)

file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz", package="prozor")
fasta = readPeptideFasta(file = file)
getAnnot(fasta[[1]])
x <- reverseSeq(fasta)

revseq <- reverseSeq(fasta ,revLab = "REV_")
stopifnot(length(revseq) == length(fasta))
stopifnot(grep("^REV_","REV_zz|ZZ_FGCZCont0000|")==1)

tmp <- list(as.SeqFastaAA(("DYKDDDDK"),name="Flag|FLAG|p2079",Annot=""))

reverseSeq(tmp)

write fasta lists into file

Description

peptides which do not have protein assignment drop out

Usage

writeFasta(file, ...)
writeFasta(file, ...)

Arguments

`file`	where to write
`...`	fasta list or single file

Value

writes a file.

Examples

#example how to create a protein db with decoy sequences
library(seqinr)
#library(prozor)
file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor")
fasta = readPeptideFasta(file = file)
revfasta <- reverseSeq(fasta)
decoyDB <- c(fasta,revfasta)
stopifnot(length(decoyDB) == 2 * length(fasta))
## Not run: 
writeFasta(decoyDB, file="test.fasta")

## End(Not run)
#example how to create a protein db with decoy sequences
library(seqinr)
#library(prozor)
file = system.file("extdata/fgcz_contaminants2021_20210929.fasta.gz",package = "prozor")
fasta = readPeptideFasta(file = file)
revfasta <- reverseSeq(fasta)
decoyDB <- c(fasta,revfasta)
stopifnot(length(decoyDB) == 2 * length(fasta))
## Not run: 
writeFasta(decoyDB, file="test.fasta")

## End(Not run)

Package 'prozor'

Help Index

annotate peptides using AhoCorasickTrie

Description

Usage

Arguments

Value

Examples

Annotate peptides with protein ids

Description

Usage

Arguments

Value

Examples

Data frame as produced by COMET-MS search engine

Description

Compute dynamic swath windows

Description

Arguments

Value

Fields

Methods

Examples

Compute FDR given a score

Description

Usage

Arguments

Value

Examples

Compute FDR given a score

Description

Usage

Arguments

Value

Examples

create fasta db from one or more fasta files

Description

Usage

Arguments

Value

Examples

Create db with decoys and contaminants

Description

Usage

Arguments

Value

Examples

Data frame score and proteinID

Description

given matrix (columns protein rows peptides), compute minimal protein set using greedy algorithm

Description

Usage

Arguments

Value

Examples

converts result of greedy_parsimony function to a matrix with 3 columns - peptide - charge and protein

Description

Usage

Arguments

Value

load special prot

Description

Usage

Arguments

Value

Examples

load universal contaminants

Description

Usage

Arguments

Value

Examples

load list of contaminant sequences FGCZ 2019

Description

Usage

Arguments

Value

Examples

load list of contaminant sequences FGCZ 2021

Description