From 5a5151e4e1b82209bd774ae116704a0ea2c2c4bd Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 26 Jan 2019 15:11:52 +0100 Subject: [PATCH] allow weights in sampling --- CHANGELOG.md | 3 +++ REQUIRE | 1 + src/MotifSequenceGenerator.jl | 35 ++++++++++++++++++----------------- test/float_length_tests.jl | 10 ++++++++++ test/integer_length_tests.jl | 10 ++++++++++ 5 files changed, 42 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ba835f..7e41efe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,5 +7,8 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html # master work in progress changes are contained in this section. +# v0.2.0 +Keyword argument `weights` allows you to choose weights for the motifs that are used in the initial sequence. + # v0.1.0 - Initial Release Changelog is kept with respect to this release. diff --git a/REQUIRE b/REQUIRE index aa51726..1dc36fe 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,2 +1,3 @@ julia 0.7.0-beta2 Combinatorics 0.6.0 +StatsBase 0.23.0 diff --git a/src/MotifSequenceGenerator.jl b/src/MotifSequenceGenerator.jl index 0764b66..66d6eac 100644 --- a/src/MotifSequenceGenerator.jl +++ b/src/MotifSequenceGenerator.jl @@ -6,7 +6,7 @@ All main functionality is given by the function [`random_sequence`](@ref). """ module MotifSequenceGenerator -using Combinatorics, Random +using Combinatorics, Random, StatsBase export random_sequence, all_possible_sums @@ -23,11 +23,12 @@ Base.showerror(io::IO, e::DeadEndMotifs) = print(io, The algorithm works as follows: First a random sequence of motifs is created, so that it has length of `q - δq ≤ ℓ ≤ q - δq + maximum(motiflengths)`. The possible tries of random sequences is set by the `tries` keyword (default `5`). +The sequence is optionally sampled given a probability vector. -For each random try, it is first check whether the sequence is already correct. +For each random try, it is first checked whether the sequence is already correct. If not, the last entry of the sequence is dropped. Then, since the sequence is now already smaller than `q`, all possible sums of `summands` out of the motif pool -are checked. If some combination of `summands` sums to exactly the difference, +are checked. If some combination of `summands` sums to the difference, they are added to the sequence. For multiple satisfactory combinations, a random one is picked. @@ -50,7 +51,9 @@ a proper one, an error is thrown. Create a random sequence of motifs of type `M`, under the constraint that the sequence has "length" `ℓ` **exactly** within `q - δq ≤ ℓ ≤ q + δq`. Return the sequence itself as well as the -sequence of indices of `motifs` used to create it. +sequence of indices of `motifs` used to create it. A vector of probabilities `weights` +can be given as a keyword argument, which then dictates the sampling probability +for each entry of `motifs` for the initial sequence created. "length" here means an abstracted length defined by the struct `M`, based on the `limits` and `translate` functions. @@ -64,7 +67,7 @@ It does **not** refer to the amount of elements! motif which is translated by `t` (either negative or positive), with respect to the same units as `q`. -## Keywords +## Other Keywords Please see the source code (use `@which`) for a full description of the algorithm. * `tries = 5` : Up to how many initial random sequences are accepted. @@ -74,7 +77,10 @@ Please see the source code (use `@which`) for a full description of the algorith """ function random_sequence(motifs::Vector{M}, q, limits, translate, δq = 0; - tries = 5, summands = 3, tailcut = 2) where {M} + tries = 5, summands = 3, tailcut = 2, + weights = ones(length(motifs))) where {M} + + ws = _toweight(weights) idxs = 1:length(motifs) motifs0, motiflens = _motifs_at_origin(motifs, limits, translate) @@ -94,7 +100,7 @@ function random_sequence(motifs::Vector{M}, q, while worked == false count > tries && throw(DeadEndMotifs(tries, summands, tailcut)) - seq, seq_length = _random_sequence_try(motiflens, q, δq) + seq, seq_length = _random_sequence_try(motiflens, q, δq, ws) worked = _complete_sequence!(seq, motiflens, q, δq, summands, tailcut) @@ -104,6 +110,8 @@ function random_sequence(motifs::Vector{M}, q, return _instantiate_sequence(motifs0, motiflens, seq, translate), seq end +_toweight(a) = (s = sum(a); ProbabilityWeights(a./s, 1)) + """ _motifs_at_origin(motifs, limits, translate) -> (motifs0, motiflens) Bring all motifs to the origin and compute the motif lengths. @@ -121,15 +129,15 @@ function _motifs_at_origin(motifs::Vector{M}, limits, translate) where M end """ - _random_sequence_try(motiflens, q) -> seq, seq_length + _random_sequence_try(motiflens, q, δq [, ws]) -> seq, seq_length Return a random sequence of motif indices so that the total sequence is *guaranteed* to have total length of `q - δq ≤ ℓ ≤ q - δq + maximum(motiflens)`. """ -function _random_sequence_try(motiflens, q, δq) +function _random_sequence_try(motiflens, q, δq, ws = defaultweights(motiflens)) seq = Int[]; seq_length = 0; idxs = 1:length(motiflens) while seq_length < q - δq - i = rand(idxs) + i = sample(idxs, ws) push!(seq, i) seq_length += motiflens[i] end @@ -174,13 +182,6 @@ function _complete_sequence_remainder!(seq, motiflens, q, δq, summands, tailcut pop!(seq) isempty(seq) && return false - # I Think the following if is unecessary?... - # if q - δq - sum(motiflens[k] for k in seq) < 0 - # ok = _complete_sequence_extra!(seq, motiflens, q, δq) - # isempty(seq) && return false - # ok && return true - # end - # At this point ℓ is guaranteed less than q - δq remainder = q - δq - sum(motiflens[k] for k in seq) @assert remainder > 0 diff --git a/test/float_length_tests.jl b/test/float_length_tests.jl index 3924608..4ada78e 100644 --- a/test/float_length_tests.jl +++ b/test/float_length_tests.jl @@ -35,6 +35,16 @@ end end end +@testset "Float Length, Weights, δq=$(δq)" for δq in [1.0, 2.0] + weights = rand(1:5, N) + for j in 1:N + r, s = random_sequence(shouts, q, shoutlimits, shouttranslate, δq; + weights = weights) + ℓ = shoutlens(r) + @test q - δq ≤ ℓ ≤ q + δq + end +end + using MotifSequenceGenerator: DeadEndMotifs @test_throws ArgumentError random_sequence(shouts, q, shoutlimits, shouttranslate, 0.0) @test_throws DeadEndMotifs random_sequence(shouts, q, shoutlimits, shouttranslate, 0.000001) diff --git a/test/integer_length_tests.jl b/test/integer_length_tests.jl index 3a52764..40d6e4c 100644 --- a/test/integer_length_tests.jl +++ b/test/integer_length_tests.jl @@ -34,6 +34,16 @@ end end end +@testset "Integer Length, Weights, δq=$(δq)" for δq in [0, 2] + weights = rand(1:5, N) + for j in 1:N + r, s = random_sequence(shouts, q, shoutlimits, shouttranslate, δq; + tries = 10, weights = weights) + ℓ = shoutlens(r) + @test q - δq ≤ ℓ ≤ q + δq + end +end + using MotifSequenceGenerator: DeadEndMotifs @test_throws DeadEndMotifs random_sequence(shouts, 7, shoutlimits, shouttranslate, 0)