50K simulations results

50K simulations results#

Let’s test the 50K simulations results:

from pathlib import Path
from collections import defaultdict

import tskit
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tskitetude import get_project_dir

Let’s declare some stuff

repeats = 5
breed_size = [1, 2, 5, 10, 15, 20]
simulation = "50K_simulations"
reference_prefix = "results-reference"
compara_prefix = "results-compara"
chromosome = "26"

def collect_stats(results_prefix, simulation, chromosome, breed_size, repeats):

    data = defaultdict(list)
    prefix = get_project_dir() / results_prefix / simulation

    for i in breed_size:

        for j in range(repeats):
            run = f"{i}_breeds-{j}-50K"
            filename = prefix / run / "tsinfer" / f"SMARTER-OA-OAR3-forward-0.4.10-50K.focal.{chromosome}.trees"

            if not  filename.exists():
                print(f"File {filename} does not exist")
                continue

            ts = tskit.load(filename)

            # track some stats for tree object
            data["breeds"] += [i]
            data["repeat"] += [j]
            data["filename"] += [filename]
            data["sites"] += [ts.num_sites]
            data["trees"] += [ts.num_trees]
            data["edges"] += [ts.num_edges]
            data["individuals"] += [ts.num_individuals]
            data["samples"] += [ts.num_samples]
            data["mutations"] += [ts.num_mutations]
            data["nodes"] += [ts.num_nodes]
            data["populations"] += [ts.num_populations]

    return pd.DataFrame(data)

Read data from reference simulations:

reference = collect_stats(reference_prefix, simulation, chromosome, breed_size, repeats)
reference.head()

	breeds	repeat	filename	sites	trees	edges	individuals	samples	mutations	nodes	populations
0	1	0	/home/core/TSKITetude/results-reference/50K_si...	792	752	7717	24	48	792	1355	1
1	1	1	/home/core/TSKITetude/results-reference/50K_si...	773	732	7525	24	48	773	1309	1
2	1	2	/home/core/TSKITetude/results-reference/50K_si...	673	616	5974	23	46	673	1102	1
3	1	3	/home/core/TSKITetude/results-reference/50K_si...	766	697	6184	24	48	766	1168	1
4	1	4	/home/core/TSKITetude/results-reference/50K_si...	573	438	3153	21	42	573	707	1

Collect stats for compara:

compara = collect_stats(compara_prefix, simulation, chromosome, breed_size, repeats)
compara.head()

	breeds	repeat	filename	sites	trees	edges	individuals	samples	mutations	nodes	populations
0	1	0	/home/core/TSKITetude/results-compara/50K_simu...	791	557	5667	24	48	1865	1006	1
1	1	1	/home/core/TSKITetude/results-compara/50K_simu...	774	542	5664	24	48	1751	979	1
2	1	2	/home/core/TSKITetude/results-compara/50K_simu...	675	476	4577	23	46	1476	846	1
3	1	3	/home/core/TSKITetude/results-compara/50K_simu...	761	528	4822	24	48	1412	918	1
4	1	4	/home/core/TSKITetude/results-compara/50K_simu...	555	333	2443	21	42	786	553	1

sns.pairplot(reference[reference.columns.difference(['repeat'])], hue="breeds", palette="viridis")

<seaborn.axisgrid.PairGrid at 0x7f66381791c0>

../_images/66ae604ee67d08e80901836a17a3f7a34d345d5af0eecf61d16689f022ff462f.png

sns.pairplot(compara[compara.columns.difference(["repeat"])], hue="breeds", palette="viridis")

<seaborn.axisgrid.PairGrid at 0x7f65e1f721b0>

../_images/dc815eac733d75ba10c7867ceaa21e410813615a2b0405fc4a4b5737482e460f.png

It seems that the number of mutation is higher using the compara approach instead of the reference approach. Let’s try to model the number of mutations relying on the numer or samples. Try to fit a linear model using sns.regplot:

sns.regplot(data=compara, x="samples", y="mutations", ci=95)

<Axes: xlabel='samples', ylabel='mutations'>

../_images/6ebce23079084440dbfa0fb97eb1f9a990aaba00c5ef00b19cde1784b7968a10.png

Try to fit a 2nd degree polynomial model using np.polyfit:

# fit a polynomial to the data
coeffs = np.polyfit(compara['samples'], compara['mutations'], 2)
poly = np.poly1d(coeffs)

# generate values for x and y
x_vals = np.linspace(compara['samples'].min(), compara['samples'].max(), 100)
y_vals = poly(x_vals)

# calculate residuals
y_fit = poly(compara['samples'])
residuals = compara['mutations'] - y_fit
std_residuals = np.std(residuals)

# calculate R^2
ss_res = np.sum(residuals**2)
ss_tot = np.sum((compara['mutations'] - np.mean(compara['mutations']))**2)
r2 = 1 - (ss_res / ss_tot)

# plot the data
plt.figure(figsize=(10, 6))
plt.scatter(compara['samples'], compara['mutations'], label='Data')
plt.plot(x_vals, y_vals, color='red', label='fitted polynomial')
plt.fill_between(x_vals, y_vals - std_residuals, y_vals + std_residuals, color='red', alpha=0.2, label='Error')

# add formula to the plot
formula = f'$y = {coeffs[0]:.2f}x^2 + {coeffs[1]:.2f}x + {coeffs[2]:.2f}$'
r2_text = f'$R^2 = {r2:.2f}$'
plt.text(0.05, 0.95, formula, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')
plt.text(0.05, 0.90, r2_text, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')

# Add labels and title
plt.xlabel('samples')
plt.ylabel('mutations')
plt.title('Samples Vs Mutations')
plt.legend(loc='lower right')

# Show the plot
plt.show()

../_images/f968e350b12c6f06863ac23a2e498ae1f91ce2b8165564d8d6b5b09887527816.png