Population statistics#
try to do some statistics on smarter samples
import tskit
import json
import pandas as pd
import numpy as np
from lets_plot import *
LetsPlot.setup_html()
from collections import Counter
from tskitetude import get_project_dir
Determine chromosome and file to open
chromosome = 1
tsFile = get_project_dir() / f"results-compara/50K_simulations/10_breeds-0-50K/tsinfer/SMARTER-OA-OAR3-forward-0.4.10-50K.focal.{chromosome}.trees"
ts = tskit.load(tsFile)
tsPos = [x.position for x in ts.sites()]
# Prepare some lists
breeds = list(set([json.loads(ts.population(ts.node(u).population).metadata)['breed'] for u in ts.samples()]))
sample_nodes = [ts.node(n) for n in ts.samples()]
samples_listed_by_breed_dict = { pop: [s.id for s in sample_nodes if json.loads(ts.population(s.population).metadata)['breed'] == pop] for pop in breeds}
samples_listed_by_breed = [ [s.id for s in sample_nodes if json.loads(ts.population(s.population).metadata)['breed'] == pop] for pop in breeds]
num_populations = ts.num_populations
breedPairs = [(x, y) for x in range(num_populations) for y in range(num_populations) if x < y]
windows = np.linspace(0, int(ts.sequence_length), num= int(ts.sequence_length) // 1_000)
tmrca = ts.divergence(sample_sets=samples_listed_by_breed, indexes=breedPairs, windows=windows, mode='branch') // 2
gnn = ts.genealogical_nearest_neighbours(
ts.samples(), samples_listed_by_breed
)
cols = {breed: gnn[:, u] for u, breed in enumerate(breeds)}
cols["breed"] = [json.loads(ts.population(ts.node(u).population).metadata)["breed"] for u in ts.samples()]
GnnDF = pd.DataFrame(cols)
GnnDF.to_csv(f"GnnDF_chr{chromosome}.csv")
Tajima by breed
popsTajima = pd.DataFrame()
for breed in breeds:
tajima = ts.Tajimas_D(sample_sets=samples_listed_by_breed_dict[breed])
tmp = pd.DataFrame({"TajimaD": [tajima], "Breed": breed})
popsTajima = pd.concat([popsTajima, tmp])
popsTajima.to_csv("Tajima_" + str(chromosome) + ".csv", index = None)
Window Tajima by breed:
popsTajima = pd.DataFrame()
for breed in breeds:
windowTajima = ts.Tajimas_D(sample_sets=samples_listed_by_breed_dict[breed], windows=windows)
tmp = pd.DataFrame({"Breakpoint": windows[:-1], "TajimaD": list(windowTajima), "Breed": breed})
popsTajima = pd.concat([popsTajima, tmp])
popsTajima.to_csv("WindowedTajima_" + str(chromosome) + ".csv", index = None)
DIversity by breed
popsDiversity = pd.DataFrame()
for breed in breeds:
diversity = ts.diversity(sample_sets=samples_listed_by_breed_dict[breed])
tmp = pd.DataFrame({"Diversity": [diversity], "Breed": breed})
popsDiversity = pd.concat([popsDiversity, tmp])
popsDiversity.to_csv("Diversity_" +str(chromosome) + ".csv", index = None)
popsDiversity = pd.DataFrame()
for breed in breeds:
diversity = ts.diversity(sample_sets=samples_listed_by_breed_dict[breed])
tmp = pd.DataFrame({"Diversity": [diversity], "Breed": breed})
popsDiversity = pd.concat([popsDiversity, tmp])
popsDiversity.to_csv("Diversity_" +str(chromosome) + ".csv", index = None)
Window diversity by breed
popsDiversity = pd.DataFrame()
for breed in breeds:
windowDiversity = ts.diversity(sample_sets=samples_listed_by_breed_dict[breed], windows=windows)
tmp = pd.DataFrame({"Breakpoint": windows[:-1], "Diversity": list(windowDiversity), "Breed": breed})
popsDiversity = pd.concat([popsDiversity, tmp])
popsDiversity.to_csv("WindowedDiversity_" +str(chromosome) + ".csv", index = None)
Fst by breed pairs
popsFst = pd.DataFrame()
for breed1, breed2 in breedPairs:
breedName1 = breeds[breed1]
breedName2 = breeds[breed2]
windowFst = ts.Fst(sample_sets=[samples_listed_by_breed[breed1], samples_listed_by_breed[breed2]])
tmp = pd.DataFrame({"Fst": [windowFst], "Breed1": breedName1, "Breed2": breedName2})
popsFst = pd.concat([popsFst, tmp])
popsFst.to_csv("WindowedFst_" + str(chromosome) + ".csv", index = None)
# Divergence by breed pairs
popsDivergence = pd.DataFrame()
for breed1, breed2 in breedPairs:
breedName1 = breeds[breed1]
breedName2 = breeds[breed2]
divergence = ts.divergence(sample_sets=[samples_listed_by_breed[breed1], samples_listed_by_breed[breed2]])
tmp = pd.DataFrame({"Divergence": [divergence], "Breed1": breedName1, "Breed2": breedName2})
popsDivergence = pd.concat([popsDivergence, tmp])
popsDivergence.to_csv("Divergence_" + str(chromosome) + ".csv", index = None)
# Window divergence by breed pairs
popsDivergence = pd.DataFrame()
for breed1, breed2 in breedPairs:
breedName1 = breeds[breed1]
breedName2 = breeds[breed2]
windowDivergence = ts.divergence(sample_sets=[samples_listed_by_breed[breed1], samples_listed_by_breed[breed2]], windows=windows)
tmp = pd.DataFrame({"Breakpoint": windows[:-1], "Divergence": list(windowDivergence), "Breed1": breedName1, "Breed2": breedName2})
popsDivergence = pd.concat([popsDivergence, tmp])
popsDivergence.to_csv("WindowedDivergence_" + str(chromosome) + ".csv", index = None)
breeds
['MAS', 'IDF', 'SWA', 'LLW', 'SUM', 'LAT', 'SAA', 'ALT', 'IAW', 'CHR']