50K simulations results

50K simulations results#

Let’s test the 50K simulations results:

from pathlib import Path
from collections import defaultdict

import tskit
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tskitetude import get_project_dir

Let’s declare some stuff

repeats = 5
breed_size = [1, 2, 5, 10, 15, 20]
simulation = "50K_simulations"
reference_prefix = "results-reference"
compara_prefix = "results-compara"
chromosome = "26"
def collect_stats(results_prefix, simulation, chromosome, breed_size, repeats):

    data = defaultdict(list)
    prefix = get_project_dir() / results_prefix / simulation

    for i in breed_size:

        for j in range(repeats):
            run = f"{i}_breeds-{j}-50K"
            filename = prefix / run / "tsinfer" / f"SMARTER-OA-OAR3-forward-0.4.10-50K.focal.{chromosome}.trees"

            if not  filename.exists():
                print(f"File {filename} does not exist")
                continue

            ts = tskit.load(filename)

            # track some stats for tree object
            data["breeds"] += [i]
            data["repeat"] += [j]
            data["filename"] += [filename]
            data["sites"] += [ts.num_sites]
            data["trees"] += [ts.num_trees]
            data["edges"] += [ts.num_edges]
            data["individuals"] += [ts.num_individuals]
            data["samples"] += [ts.num_samples]
            data["mutations"] += [ts.num_mutations]
            data["nodes"] += [ts.num_nodes]
            data["populations"] += [ts.num_populations]

    return pd.DataFrame(data)

Read data from reference simulations:

reference = collect_stats(reference_prefix, simulation, chromosome, breed_size, repeats)
reference.head()
breeds repeat filename sites trees edges individuals samples mutations nodes populations
0 1 0 /home/core/TSKITetude/results-reference/50K_si... 792 752 7717 24 48 792 1355 1
1 1 1 /home/core/TSKITetude/results-reference/50K_si... 773 732 7525 24 48 773 1309 1
2 1 2 /home/core/TSKITetude/results-reference/50K_si... 673 616 5974 23 46 673 1102 1
3 1 3 /home/core/TSKITetude/results-reference/50K_si... 766 697 6184 24 48 766 1168 1
4 1 4 /home/core/TSKITetude/results-reference/50K_si... 573 438 3153 21 42 573 707 1

Collect stats for compara:

compara = collect_stats(compara_prefix, simulation, chromosome, breed_size, repeats)
compara.head()
breeds repeat filename sites trees edges individuals samples mutations nodes populations
0 1 0 /home/core/TSKITetude/results-compara/50K_simu... 791 557 5667 24 48 1865 1006 1
1 1 1 /home/core/TSKITetude/results-compara/50K_simu... 774 542 5664 24 48 1751 979 1
2 1 2 /home/core/TSKITetude/results-compara/50K_simu... 675 476 4577 23 46 1476 846 1
3 1 3 /home/core/TSKITetude/results-compara/50K_simu... 761 528 4822 24 48 1412 918 1
4 1 4 /home/core/TSKITetude/results-compara/50K_simu... 555 333 2443 21 42 786 553 1
sns.pairplot(reference[reference.columns.difference(['repeat'])], hue="breeds", palette="viridis")
<seaborn.axisgrid.PairGrid at 0x7f66381791c0>
../_images/66ae604ee67d08e80901836a17a3f7a34d345d5af0eecf61d16689f022ff462f.png
sns.pairplot(compara[compara.columns.difference(["repeat"])], hue="breeds", palette="viridis")
<seaborn.axisgrid.PairGrid at 0x7f65e1f721b0>
../_images/dc815eac733d75ba10c7867ceaa21e410813615a2b0405fc4a4b5737482e460f.png

It seems that the number of mutation is higher using the compara approach instead of the reference approach. Let’s try to model the number of mutations relying on the numer or samples. Try to fit a linear model using sns.regplot:

sns.regplot(data=compara, x="samples", y="mutations", ci=95)
<Axes: xlabel='samples', ylabel='mutations'>
../_images/6ebce23079084440dbfa0fb97eb1f9a990aaba00c5ef00b19cde1784b7968a10.png

Try to fit a 2nd degree polynomial model using np.polyfit:

# fit a polynomial to the data
coeffs = np.polyfit(compara['samples'], compara['mutations'], 2)
poly = np.poly1d(coeffs)

# generate values for x and y
x_vals = np.linspace(compara['samples'].min(), compara['samples'].max(), 100)
y_vals = poly(x_vals)

# calculate residuals
y_fit = poly(compara['samples'])
residuals = compara['mutations'] - y_fit
std_residuals = np.std(residuals)

# calculate R^2
ss_res = np.sum(residuals**2)
ss_tot = np.sum((compara['mutations'] - np.mean(compara['mutations']))**2)
r2 = 1 - (ss_res / ss_tot)

# plot the data
plt.figure(figsize=(10, 6))
plt.scatter(compara['samples'], compara['mutations'], label='Data')
plt.plot(x_vals, y_vals, color='red', label='fitted polynomial')
plt.fill_between(x_vals, y_vals - std_residuals, y_vals + std_residuals, color='red', alpha=0.2, label='Error')

# add formula to the plot
formula = f'$y = {coeffs[0]:.2f}x^2 + {coeffs[1]:.2f}x + {coeffs[2]:.2f}$'
r2_text = f'$R^2 = {r2:.2f}$'
plt.text(0.05, 0.95, formula, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')
plt.text(0.05, 0.90, r2_text, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')

# Add labels and title
plt.xlabel('samples')
plt.ylabel('mutations')
plt.title('Samples Vs Mutations')
plt.legend(loc='lower right')

# Show the plot
plt.show()
../_images/f968e350b12c6f06863ac23a2e498ae1f91ce2b8165564d8d6b5b09887527816.png