Skip to content

Commit 39ad307

Browse files
authored
Merge pull request #24 from jmenglund/develop
Add the ability to manually set base frequences
2 parents e82c45b + afd9ee2 commit 39ad307

5 files changed

Lines changed: 88 additions & 44 deletions

File tree

CHANGELOG.rst

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,30 @@ All notable changes to this project will be documented in this file.
55
This project adheres to `Semantic Versioning <http://semver.org/>`_.
66

77

8+
v0.7.0 - 2019-07-11
9+
-------------------
10+
11+
* Added flags ``-f`` and ``--freqs`` to manually set base frequences.
12+
As a consequence, the flags ``-o`` and ``--out-format`` have to be
13+
used when specifying the output format.
14+
15+
`View commits <https://github.com/jmenglund/predsim/compare/v0.6.0...v0.7.0>`_
16+
17+
818
v0.6.0 - 2019-05-05
919
-------------------
1020

1121
Added
1222
~~~~~
1323

14-
* Ability to write used trees to a file with the ``--trees-file`` option.
24+
* Ability to write used trees to a file with the ``--trees-file`` flag.
1525
* Simple test data for various substitution models.
1626

1727

1828
Changed
1929
~~~~~~~
2030

21-
* Command-line option ``-o`` and ``--output-format`` replaced with ``-f`` and
31+
* Command-line flags ``-o`` and ``--output-format`` replaced with ``-f`` and
2232
``--format``, respectively.
2333
* The file with seed numbers may now contain more numbers than being used.
2434
* Content is written to output after completing each simulation (instead
@@ -66,8 +76,8 @@ v0.4.0 - 2019-03-09
6676
Added
6777
~~~~~
6878

69-
* The command-line options ``-o`` and ``--output-format`` now allow
70-
writing output to either the PHYLIP or the NEXUS format.
79+
* The command-line flags ``-o`` and ``--output-format`` now allow
80+
writing output to either Phylip or Nexus format.
7181

7282

7383
Changed

README.rst

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ predsim
44
|Build-Status| |Coverage-Status| |PyPI-Status| |License| |DOI-URI|
55

66
**predsim** is a simple command-line tool for simulating predictive
7-
datasets from `MrBayes <http://mrbayes.sourceforge.net>`_ output files.
7+
datasets from `MrBayes' <http://mrbayes.sourceforge.net>`_ output files.
88
Datasets can be simulated under the GTR+G+I substitution model or any nested
99
variant available in MrBayes (JC69, HKY85 etc.). The code is contained
1010
within a single module that can be imported using Python's import mechanism.
@@ -65,9 +65,9 @@ Usage
6565
.. code-block::
6666
6767
$ predsim --help
68-
usage: predsim [-h] [-V] [-l N] [-g N] [-s N] [-n N] [-f {nexus,phylip}]
69-
[-p FILE] [--seeds-file FILE] [--commands-file FILE]
70-
[--trees-file FILE]
68+
usage: predsim [-h] [-V] [-l N] [-f #A #C #G #T] [-g N] [-s N] [-n N]
69+
[-o {nexus,phylip}] [-p FILE] [--seeds-file FILE]
70+
[--commands-file FILE] [--trees-file FILE]
7171
pfile tfile
7272
7373
A command-line utility that reads posterior output of MrBayes and simulates
@@ -81,12 +81,15 @@ Usage
8181
-h, --help show this help message and exit
8282
-V, --version show program's version number and exit
8383
-l N, --length N sequence lenght (default: 1000)
84+
-f #A #C #G #T, --freqs #A #C #G #T
85+
base frequences (overrides any base frequences in
86+
MrBayes' output)
8487
-g N, --gamma-cats N number of gamma rate categories (default: continuous)
8588
-s N, --skip N number of records (trees) to skip at the beginning of
8689
the sample (default: 0)
8790
-n N, --num-records N
8891
number of records (trees) to use in the simulation
89-
-f {nexus,phylip}, --format {nexus,phylip}
92+
-o {nexus,phylip}, --out-format {nexus,phylip}
9093
output format (default: "nexus")
9194
-p FILE, --seqgen-path FILE
9295
path to a Seq-Gen executable (default: "seq-gen")
@@ -95,9 +98,10 @@ Usage
9598
--commands-file FILE path to output file with commands used by Seq-Gen
9699
--trees-file FILE path to output file with trees used by Seq-Gen
97100
98-
99-
* It is recommended that you use the ``--commands-file`` and the ``--trees-file``
100-
options to check the input given to Seq-Gen.
101+
* If base frequences are missing from MrBayes' output, these must be set manually
102+
with the ``-f`` (or ``--freqs``) flag.
103+
* It is recommended that you use the ``--commands-file`` and ``--trees-file``
104+
flags to check the input given to Seq-Gen.
101105

102106

103107
Running the tests

predsim.py

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
__author__ = 'Markus Englund'
2323
__license__ = 'MIT'
24-
__version__ = '0.6.0'
24+
__version__ = '0.7.0'
2525

2626

2727
def main(args=None):
@@ -41,7 +41,7 @@ def main(args=None):
4141

4242
result_iterator = iter_seqgen_results(
4343
simulation_input, seq_len=parser.length, gamma_cats=parser.gamma_cats,
44-
seqgen_path=parser.sg_filepath)
44+
basefreqs=parser.basefreqs, seqgen_path=parser.sg_filepath)
4545

4646
if parser.out_format == 'nexus':
4747
schema_kwargs = {'schema': 'nexus', 'simple': False}
@@ -75,6 +75,12 @@ def parse_args(args):
7575
parser.add_argument(
7676
'-l', '--length', action='store', default=1000, type=int,
7777
help='sequence lenght (default: 1000)', metavar='N', dest='length')
78+
parser.add_argument(
79+
'-f', '--freqs', action='store', type=float, nargs=4,
80+
help=(
81+
'base frequences (overrides any base frequences '
82+
'in MrBayes\' output)'),
83+
metavar=('#A', '#C', '#G', '#T'), dest='basefreqs')
7884
parser.add_argument(
7985
'-g', '--gamma-cats', action='store', type=int,
8086
help='number of gamma rate categories (default: continuous)',
@@ -88,7 +94,7 @@ def parse_args(args):
8894
help='number of records (trees) to use in the simulation',
8995
metavar='N', dest='num_records')
9096
parser.add_argument(
91-
'-f', '--format', default='nexus', choices=['nexus', 'phylip'],
97+
'-o', '--out-format', default='nexus', choices=['nexus', 'phylip'],
9298
help='output format (default: "nexus")', dest='out_format')
9399
parser.add_argument(
94100
'-p', '--seqgen-path', default='seq-gen', type=str,
@@ -111,7 +117,7 @@ def parse_args(args):
111117
help='path to a MrBayes p-file', metavar='pfile')
112118
parser.add_argument(
113119
'tfile_path', action=StoreExpandedPath, type=is_file,
114-
help='path to a MrBayes t-file', metavar='tfile', )
120+
help='path to a MrBayes t-file', metavar='tfile')
115121

116122
return parser.parse_args(args)
117123

@@ -200,35 +206,41 @@ def kappa_to_titv(kappa, piA, piC, piG, piT):
200206
return titv
201207

202208

203-
def get_seqgen_params(mrbayes_params):
209+
def get_seqgen_params(mrbayes_params, basefreqs=None):
204210
"""
205211
Adapt MrBayes parameter values for use with Seq-Gen.
206212
207213
Paramters
208214
---------
209-
mrbayes_prams : dict
215+
mrbayes_params : dict
210216
Parameter values from a single row in a MrBayes p-file.
217+
basefreqs : list of floats
218+
Frequences for the four nucleotides A, C, G, and T to use
219+
if missing from MrBayes output.
211220
212221
Returns
213222
-------
214223
seqgen_params : dict
215224
"""
216-
seqgen_params = {}
217-
try:
218-
seqgen_params['state_freqs'] = (
219-
str(mrbayes_params['pi(A)']) + ',' +
220-
str(mrbayes_params['pi(C)']) + ',' +
221-
str(mrbayes_params['pi(G)']) + ',' +
222-
str(mrbayes_params['pi(T)']))
223-
except KeyError:
224-
seqgen_params['state_freqs'] = '0.25,0.25,0.25,0.25'
225+
226+
if basefreqs is None:
227+
try:
228+
basefreqs = [
229+
float(mrbayes_params['pi(A)']),
230+
float(mrbayes_params['pi(C)']),
231+
float(mrbayes_params['pi(G)']),
232+
float(mrbayes_params['pi(T)'])]
233+
except KeyError as exc:
234+
msg = (
235+
'Base frequences must be provided since they '
236+
'are not present in MrBayes\' output.')
237+
raise KeyError(msg) from exc
238+
239+
seqgen_params = {'state_freqs': ','.join([str(v) for v in basefreqs])}
240+
225241
try:
226242
seqgen_params['ti_tv'] = kappa_to_titv(
227-
float(mrbayes_params['kappa']),
228-
float(mrbayes_params['pi(A)']),
229-
float(mrbayes_params['pi(C)']),
230-
float(mrbayes_params['pi(G)']),
231-
float(mrbayes_params['pi(T)']))
243+
float(mrbayes_params['kappa']), *basefreqs)
232244
except KeyError:
233245
pass
234246
try:
@@ -267,11 +279,11 @@ def combine_simulation_input(tree_list, p_dicts, rng_seeds=None):
267279

268280

269281
def iter_seqgen_results(
270-
simulation_input, seq_len=1000, gamma_cats=None,
282+
simulation_input, seq_len=1000, gamma_cats=None, basefreqs=None,
271283
seqgen_path='seq-gen'):
272284
"""Iterate over multiple simulations."""
273285
for tree, p_dict, rng_seed in simulation_input:
274-
seqgen_params = get_seqgen_params(p_dict)
286+
seqgen_params = get_seqgen_params(p_dict, basefreqs=basefreqs)
275287
result = simulate_matrix(
276288
tree, seq_len=seq_len, rng_seed=rng_seed,
277289
seqgen_path=seqgen_path, **seqgen_params)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name='predsim',
11-
version='0.6.0',
11+
version='0.7.0',
1212
description=(
1313
'Command-line tool for simulating predictive datasets '
1414
'from MrBayes\' output.'),

test_predsim.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,8 @@ def test_equal_basefreqs(self):
118118
assert get_seqgen_params(self.d1) == self.d2
119119

120120
def test_missing_basefreq(self):
121-
assert get_seqgen_params(self.d3) == self.d2
121+
with pytest.raises(KeyError):
122+
get_seqgen_params(self.d3)
122123

123124

124125
@seqgen_required
@@ -253,14 +254,17 @@ def test_parser(self):
253254
seeds_filepath = get_testfile_path('seeds_1.txt')
254255

255256
parser = parse_args([
256-
'-l', '2', '-s', '1', '-g', '5', '-n', '1', '-f', 'phylip',
257-
'-p', 'sg', '--seeds-file', seeds_filepath,
257+
'-l', '2', '-s', '1',
258+
'-g', '5', '--freqs', '0.25', '0.25', '0.25', '0.25',
259+
'-n', '1', '--out-format', 'phylip', '-p', 'sg',
260+
'--seeds-file', seeds_filepath,
258261
'--commands-file', self.commands_fo.name,
259262
'--trees-file', self.trees_fo.name,
260263
pfile_path, tfile_path])
261264
assert parser.length == 2
262265
assert parser.skip == 1
263266
assert parser.gamma_cats == 5
267+
assert parser.basefreqs == [0.25, 0.25, 0.25, 0.25]
264268
assert parser.num_records == 1
265269
assert parser.out_format == 'phylip'
266270
assert parser.sg_filepath == 'sg'
@@ -338,7 +342,7 @@ def test_hky_outfile(self, outfile_arg):
338342

339343
def test_hky_phylip(self, capsys):
340344
main([
341-
'-l', '2', '-s', '2', '-f', 'phylip',
345+
'-l', '2', '-s', '2', '--out-format', 'phylip',
342346
'--seeds-file', get_testfile_path('seeds_1.txt'),
343347
get_testfile_path('hky.p'),
344348
get_testfile_path('hky.t')])
@@ -351,16 +355,30 @@ def test_hky_phylip(self, capsys):
351355
'jc',
352356
'jc_gamma',
353357
'jc_propinvar',
354-
'jc_invgamma',
355-
'gtr'])
356-
def test_subst_model(self, capsys, model_string):
358+
'jc_invgamma'])
359+
def test_jc(self, capsys, model_string):
357360
pfile_path, tfile_path, expected_path = get_model_testfile_paths(
358361
model_string, num_records=1, ext='.nex')
359362
with open(expected_path, 'r') as fo:
360363
expected_out = fo.read()
361364
main([
362-
'-l', '2', '-n', '1', '--seeds-file',
363-
get_testfile_path('seeds_1.txt'), pfile_path, tfile_path])
365+
'-l', '2', '-n', '1',
366+
'--freqs', '0.25', '0.25', '0.25', '0.25',
367+
'--seeds-file', get_testfile_path('seeds_1.txt'),
368+
pfile_path, tfile_path])
369+
out, err = capsys.readouterr()
370+
assert out == expected_out
371+
assert err == ''
372+
373+
def test_gtr(self, capsys):
374+
pfile_path, tfile_path, expected_path = get_model_testfile_paths(
375+
'gtr', num_records=1, ext='.nex')
376+
with open(expected_path, 'r') as fo:
377+
expected_out = fo.read()
378+
main([
379+
'-l', '2', '-n', '1',
380+
'--seeds-file', get_testfile_path('seeds_1.txt'),
381+
pfile_path, tfile_path])
364382
out, err = capsys.readouterr()
365383
assert out == expected_out
366384
assert err == ''

0 commit comments

Comments
 (0)