Population statistics

Population statistics#

try to do some statistics on smarter samples

import tskit
import json
import pandas as pd
import numpy as np

from lets_plot import *
LetsPlot.setup_html()

from collections import Counter
from tskitetude import get_project_dir

Determine chromosome and file to open

chromosome = 1
tsFile = get_project_dir() / f"results-compara/50K_simulations/10_breeds-0-50K/tsinfer/SMARTER-OA-OAR3-forward-0.4.10-50K.focal.{chromosome}.trees"
ts = tskit.load(tsFile)
tsPos = [x.position for x in ts.sites()]
# Prepare some lists
breeds = list(set([json.loads(ts.population(ts.node(u).population).metadata)['breed'] for u in ts.samples()]))

sample_nodes = [ts.node(n) for n in ts.samples()]
samples_listed_by_breed_dict = { pop: [s.id for s in sample_nodes if json.loads(ts.population(s.population).metadata)['breed'] == pop] for pop in breeds}

samples_listed_by_breed = [ [s.id for s in sample_nodes if json.loads(ts.population(s.population).metadata)['breed'] == pop] for pop in breeds]
num_populations = ts.num_populations
breedPairs = [(x, y) for x in range(num_populations) for y in range(num_populations) if x < y]

windows = np.linspace(0, int(ts.sequence_length), num= int(ts.sequence_length) // 1_000)
tmrca = ts.divergence(sample_sets=samples_listed_by_breed, indexes=breedPairs, windows=windows, mode='branch') // 2
gnn = ts.genealogical_nearest_neighbours(
    ts.samples(), samples_listed_by_breed
)

cols = {breed: gnn[:, u] for u, breed in enumerate(breeds)}
cols["breed"] = [json.loads(ts.population(ts.node(u).population).metadata)["breed"] for u in ts.samples()]
GnnDF = pd.DataFrame(cols)
GnnDF.to_csv(f"GnnDF_chr{chromosome}.csv")

Tajima by breed

popsTajima = pd.DataFrame()

for breed in breeds:
    tajima = ts.Tajimas_D(sample_sets=samples_listed_by_breed_dict[breed])
    tmp = pd.DataFrame({"TajimaD": [tajima], "Breed": breed})
    popsTajima = pd.concat([popsTajima, tmp])

popsTajima.to_csv("Tajima_" + str(chromosome) + ".csv", index = None)

Window Tajima by breed:

popsTajima = pd.DataFrame()

for breed in breeds:
    windowTajima = ts.Tajimas_D(sample_sets=samples_listed_by_breed_dict[breed], windows=windows)
    tmp = pd.DataFrame({"Breakpoint": windows[:-1], "TajimaD": list(windowTajima), "Breed": breed})
    popsTajima = pd.concat([popsTajima, tmp])

popsTajima.to_csv("WindowedTajima_" + str(chromosome) + ".csv", index = None)

DIversity by breed

popsDiversity = pd.DataFrame()
for breed in breeds:
    diversity = ts.diversity(sample_sets=samples_listed_by_breed_dict[breed])
    tmp = pd.DataFrame({"Diversity": [diversity], "Breed": breed})
    popsDiversity = pd.concat([popsDiversity, tmp])

popsDiversity.to_csv("Diversity_" +str(chromosome) + ".csv", index = None)
popsDiversity = pd.DataFrame()
for breed in breeds:
    diversity = ts.diversity(sample_sets=samples_listed_by_breed_dict[breed])
    tmp = pd.DataFrame({"Diversity": [diversity], "Breed": breed})
    popsDiversity = pd.concat([popsDiversity, tmp])

popsDiversity.to_csv("Diversity_" +str(chromosome) + ".csv", index = None)

Window diversity by breed

popsDiversity = pd.DataFrame()
for breed in breeds:
    windowDiversity = ts.diversity(sample_sets=samples_listed_by_breed_dict[breed], windows=windows)
    tmp = pd.DataFrame({"Breakpoint": windows[:-1], "Diversity": list(windowDiversity), "Breed": breed})
    popsDiversity = pd.concat([popsDiversity, tmp])

popsDiversity.to_csv("WindowedDiversity_" +str(chromosome) + ".csv", index = None)

Fst by breed pairs

popsFst = pd.DataFrame()

for breed1, breed2 in breedPairs:
    breedName1 = breeds[breed1]
    breedName2 = breeds[breed2]
    windowFst = ts.Fst(sample_sets=[samples_listed_by_breed[breed1], samples_listed_by_breed[breed2]])
    tmp = pd.DataFrame({"Fst": [windowFst], "Breed1": breedName1, "Breed2": breedName2})
    popsFst = pd.concat([popsFst, tmp])

popsFst.to_csv("WindowedFst_" + str(chromosome) + ".csv", index = None)
# Divergence by breed pairs
popsDivergence = pd.DataFrame()
for breed1, breed2 in breedPairs:
    breedName1 = breeds[breed1]
    breedName2 = breeds[breed2]
    divergence = ts.divergence(sample_sets=[samples_listed_by_breed[breed1], samples_listed_by_breed[breed2]])
    tmp = pd.DataFrame({"Divergence": [divergence], "Breed1": breedName1, "Breed2": breedName2})
    popsDivergence = pd.concat([popsDivergence, tmp])

popsDivergence.to_csv("Divergence_" + str(chromosome) + ".csv", index = None)
# Window divergence by breed pairs
popsDivergence = pd.DataFrame()
for breed1, breed2 in breedPairs:
    breedName1 = breeds[breed1]
    breedName2 = breeds[breed2]
    windowDivergence = ts.divergence(sample_sets=[samples_listed_by_breed[breed1], samples_listed_by_breed[breed2]], windows=windows)
    tmp = pd.DataFrame({"Breakpoint": windows[:-1], "Divergence": list(windowDivergence), "Breed1": breedName1, "Breed2": breedName2})
    popsDivergence = pd.concat([popsDivergence, tmp])

popsDivergence.to_csv("WindowedDivergence_" + str(chromosome) + ".csv", index = None)
breeds
['MAS', 'IDF', 'SWA', 'LLW', 'SUM', 'LAT', 'SAA', 'ALT', 'IAW', 'CHR']