Explore threads metadata#
Threads metadata has been added to the threads TreeSequence with the annotate_tree
script. Let’s explore data and check if everything is fine.
import itertools
import cyvcf2
import tszip
import pandas as pd
import numpy as np
from tskitetude import get_data_dir, get_project_dir
Let’s load information on sample names:
sample_info = pd.read_csv(
get_data_dir() / "toInfer/tsm100M300I.sample_names.txt",
sep="\t",
header=None,
names=["population", "individual"]
)
sample_info.head()
| population | individual | |
|---|---|---|
| 0 | A | tsk_486 |
| 1 | II | tsk_276 |
| 2 | A | tsk_506 |
| 3 | II | tsk_115 |
| 4 | D | tsk_1388 |
Now open the threads object:
Warning
where are opening the fitted version of the trees, since they contain variant infomation
ts_threads = tszip.load(get_project_dir() / "results-threads/toInfer-fit/threads/ts300I25k.1.trees.tsz")
ts_threads
|
|
|
|---|---|
| Trees | 24 857 |
| Sequence Length | 99 988 756 |
| Time Units | unknown |
| Sample Nodes | 3 200 |
| Total Size | 354.1 MiB |
| Metadata |
dictchromosome: 1offset: 6714 |
| Table | Rows | Size | Has Metadata |
|---|---|---|---|
| Edges | 8 329 337 | 254.2 MiB | |
| Individuals | 1 600 | 80.8 KiB | ✅ |
| Migrations | 0 | 8 Bytes | |
| Mutations | 24 924 | 900.6 KiB | |
| Nodes | 1 304 983 | 34.8 MiB | |
| Populations | 8 | 271 Bytes | ✅ |
| Provenances | 1 | 385 Bytes | |
| Sites | 24 924 | 608.5 KiB |
| Provenance Timestamp | Software Name | Version | Command | Full record |
|---|---|---|---|---|
| 17 December, 2025 at 08:29:37 PM | threads | 0.2.1 | Unknown |
Detailsdict
software:
dictname: threadsversion: 0.2.1
parameters:
dictinput_file: ts300I25k.1.biallelic.vcf.gzmetadata_added: True populations_added: 8 individuals_added: 1600 timestamp: 2025-12- 17T20:29:37.025492+00:00 description: Tree sequence annotated with population and individual metadata |
Explore the metadata of the first 5 nodes:
for node in itertools.islice(ts_threads.nodes(), 5):
print(f"Node {node.id}: population={ts_threads.population(node.population)}, individual={ts_threads.individual(node.individual)}")
Node 0: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=0, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([0, 1], dtype=int32), metadata={'sample_id': 'tsk_105'})
Node 1: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=0, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([0, 1], dtype=int32), metadata={'sample_id': 'tsk_105'})
Node 2: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=1, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([2, 3], dtype=int32), metadata={'sample_id': 'tsk_106'})
Node 3: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=1, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([2, 3], dtype=int32), metadata={'sample_id': 'tsk_106'})
Node 4: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=2, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([4, 5], dtype=int32), metadata={'sample_id': 'tsk_107'})
Ok, seems that metadata follow the sample order in vcf file. Let’s prove it by checking
the sample names in ts_threads.individuals():
print(f"Num. samples: {ts_threads.num_samples}")
print(f"Num. individuals: {ts_threads.num_individuals}")
sample2nodes = {}
for i, individual in enumerate(ts_threads.individuals()):
if i < 5:
print(f"Individual {individual.id}: sample='{individual.metadata["sample_id"]}', nodes={individual.nodes.tolist()}")
sample2nodes[individual.metadata["sample_id"]] = individual.nodes.tolist()
Num. samples: 3200
Num. individuals: 1600
Individual 0: sample='tsk_105', nodes=[0, 1]
Individual 1: sample='tsk_106', nodes=[2, 3]
Individual 2: sample='tsk_107', nodes=[4, 5]
Individual 3: sample='tsk_108', nodes=[6, 7]
Individual 4: sample='tsk_109', nodes=[8, 9]
Let’s discover individual breeds using their node information:
sample2breed = {}
for sample_id, nodes in sample2nodes.items():
for node in nodes:
sample2breed.setdefault(
sample_id, set()
).add(ts_threads.population(ts_threads.node(node).population).metadata["breed"])
Test that sample and breed mapping is correct:
# test that sample and breed mapping is correct:
assert all(
sample_info.loc[sample_info["individual"] == sample_id, "population"].values[0] in breed_set
for sample_id, breed_set in sample2breed.items()
), "Sample to breed mapping is incorrect!"
Now test that the genotypes are consistent with sample names: using VCF as a reference,
read the focal sample generated through the pipeline:
vcf_file = get_project_dir() / "results-threads/toInfer/focal/ts300I25k.1.vcf.gz"
with cyvcf2.VCF(vcf_file) as vcf_reader:
print(vcf_reader.samples[:5]) # show first 5 sample names in VCF
['tsk_105', 'tsk_106', 'tsk_107', 'tsk_108', 'tsk_109']
Check that the number of variant is the same:
# count number of variants in VCF file
num_variants = sum(1 for _ in cyvcf2.VCF(vcf_file))
print(f"N. of variants in VCF: {num_variants}")
# count number of variants in TS file
num_ts_variants = ts_threads.num_sites
print(f"N. of variants TreeSequence: {num_ts_variants}")
# check variant size
if num_variants == num_ts_variants:
print(f"✓ N. of variants matches (between TS and VCF)!")
else:
print(f"✗ Warning: VCF has {num_variants} variants, TreeSequence has {num_ts_variants}")
N. of variants in VCF: 25000
N. of variants TreeSequence: 24924
✗ Warning: VCF has 25000 variants, TreeSequence has 24924
Ok, positions are not the same: for what I saw, first position in ts_threads is 0,
so let’s read first position on vcf file and determine the offset of TS object:
with cyvcf2.VCF(vcf_file) as vcf_reader:
variant = next(vcf_reader)
offset = variant.POS
# translate the TS positions
ts_positions = np.array([int(site.position) + offset for site in ts_threads.sites()], dtype=np.int64)
print(ts_positions[:10]) # show first 10 positions
# read positions from VCF
with cyvcf2.VCF(vcf_file) as vcf_reader:
vcf_positions = np.array([variant.POS for variant in vcf_reader])
print(vcf_positions[:10]) # show first 10 positions
[ 6714 7016 8568 15841 19058 19766 20650 24037 29977 37199]
[ 6714 7016 8568 15841 19058 19766 20650 24037 29977 37199]
# Find which variants are missing from the TreeSequence
# Get positions from both VCF and TreeSequence
print(f"TreeSequence positions: {len(ts_positions)} variants")
print(f"VCF positions: {len(vcf_positions)} variants")
print("\nExtracting variant positions from VCF...")
# collect information on variants
vcf_variants_info = []
with cyvcf2.VCF(vcf_file) as vcf_reader:
for variant in vcf_reader:
vcf_variants_info.append({
'chrom': variant.CHROM,
'pos': variant.POS,
'ref': variant.REF,
'alt': ','.join(variant.ALT) if variant.ALT else '',
'qual': variant.QUAL,
'filter': variant.FILTER if variant.FILTER else 'PASS',
})
# create a dataframe from variant information
vcf_variants_df = pd.DataFrame(vcf_variants_info)
# Find missing positions (in VCF but not in TreeSequence)
missing_in_ts = np.setdiff1d(vcf_positions, ts_positions)
print(f"\nVariants in VCF but missing in TreeSequence: {len(missing_in_ts)}")
if len(missing_in_ts) > 0:
# filter out missing variants
missing_variants_df = vcf_variants_df[vcf_variants_df['pos'].isin(missing_in_ts)].copy()
genotype_stats = []
with cyvcf2.VCF(vcf_file) as vcf_reader:
for variant in vcf_reader:
if variant.POS in missing_in_ts:
gts = np.array([gt[:2] for gt in variant.genotypes])
unique_alleles = np.unique(gts[gts >= 0]) # Exclude missing (-1)
genotype_stats.append({
'pos': variant.POS,
'unique_alleles': len(unique_alleles),
'is_monomorphic': len(unique_alleles) == 1,
'missing_gts': np.sum(gts == -1),
'total_gts': gts.size,
'allele_counts': dict(zip(*np.unique(gts.flatten(), return_counts=True)))
})
genotype_stats_df = pd.DataFrame(genotype_stats)
# join information
missing_variants_full = missing_variants_df.merge(genotype_stats_df, on='pos')
# uncomment to save data
# missing_variants_full.to_csv('missing_variants.csv', index=False)
else:
print("\n✓ All VCF variants are present in TreeSequence")
# Also check if TreeSequence has any variants not in VCF (unlikely but possible)
extra_in_ts = np.setdiff1d(ts_positions, vcf_positions)
if len(extra_in_ts) > 0:
extra_df = pd.DataFrame({'pos': extra_in_ts})
TreeSequence positions: 24924 variants
VCF positions: 25000 variants
Extracting variant positions from VCF...
Variants in VCF but missing in TreeSequence: 76
Let’s explore those variants missing in the threads TreeSequence:
missing_variants_full
| chrom | pos | ref | alt | qual | filter | unique_alleles | is_monomorphic | missing_gts | total_gts | allele_counts | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1359872 | C | T | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| 1 | 1 | 2602623 | C | A | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| 2 | 1 | 3132835 | T | C | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| 3 | 1 | 3813782 | G | A,T | None | PASS | 3 | False | 0 | 3200 | {0: 111, 1: 470, 2: 2619} |
| 4 | 1 | 5065536 | C | T | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 71 | 1 | 94983763 | A | T | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| 72 | 1 | 96413425 | T | G | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| 73 | 1 | 98046345 | C | T,G | None | PASS | 3 | False | 0 | 3200 | {0: 102, 1: 3037, 2: 61} |
| 74 | 1 | 99412716 | G | C | None | PASS | 1 | True | 0 | 3200 | {0: 3200} |
| 75 | 1 | 99701719 | T | C,G | None | PASS | 3 | False | 0 | 3200 | {0: 70, 1: 2987, 2: 143} |
76 rows × 11 columns
# Count monomorphic variants
monomorphic_count = missing_variants_full['is_monomorphic'].sum()
print(f"Number of monomorphic variants: {monomorphic_count}")
# Group by unique_alleles and count
allele_counts = missing_variants_full.groupby('unique_alleles').size()
print(f"\nCounts by number of unique alleles:")
print(allele_counts)
# More detailed breakdown
print(f"\nDetailed breakdown:")
for n_alleles, count in allele_counts.items():
percentage = 100 * count / len(missing_variants_full)
print(f" {n_alleles} unique allele(s): {count} variants ({percentage:.1f}%)")
Number of monomorphic variants: 53
Counts by number of unique alleles:
unique_alleles
1 53
2 1
3 22
dtype: int64
Detailed breakdown:
1 unique allele(s): 53 variants (69.7%)
2 unique allele(s): 1 variants (1.3%)
3 unique allele(s): 22 variants (28.9%)
So the missing variants are monomorphic SNPs (no alternates) and SNPs with more than 2 alleles. What about the SNP with two alleles? let’s check it:
missing_variants_full[missing_variants_full["unique_alleles"] == 2]
| chrom | pos | ref | alt | qual | filter | unique_alleles | is_monomorphic | missing_gts | total_gts | allele_counts | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 16 | 1 | 14337436 | G | A,C | None | PASS | 2 | False | 0 | 3200 | {0: 3152, 1: 48} |
Ok, is a SNP with tree alleles, but no allele reference is found in samples, so this is why has 2 unique alleles instead of 3. Let’s determine the position to skip in order to check genotypes:
skip_positions = missing_variants_full["pos"].tolist()
Extract TreeSequences Genotypes#
Collect the full genotype matrix from TS object:
# collect a matrix of (num_sites, num_samples)
ts_genotype_matrix = ts_threads.genotype_matrix()
print(f"Genotype matrix from TS: {ts_genotype_matrix.shape}")
print(f"Number of sites: {ts_genotype_matrix.shape[0]}")
print(f"Number of samples: {ts_genotype_matrix.shape[1]}")
print(f"\nSubsetting first 5 objects in both dimensions")
print(ts_genotype_matrix[:5, :5])
Genotype matrix from TS: (24924, 3200)
Number of sites: 24924
Number of samples: 3200
Subsetting first 5 objects in both dimensions
[[1 1 0 1 1]
[1 1 0 1 1]
[0 0 0 0 0]
[1 0 0 1 0]
[0 0 0 0 0]]
Extract VCF genotypes#
Collect the full genotype matrix from VCF (remember to skip filtered variants):
with cyvcf2.VCF(vcf_file) as vcf_reader:
print("\nExtracting genotypes from VCF...")
genotype_list = []
for i, variant in enumerate(vcf_reader):
if variant.POS in skip_positions:
continue # those variants are missing in TS
# Extract alleles as a numpy array directly
# variant.genotypes is a list, we take only the first 2 elements (alleles)
# Unnest genotypes: create a (num_samples * 2,) flat array for this variant
genotypes_array = np.array([gt[:2] for gt in variant.genotypes], dtype=np.int8)
genotype_list.append(genotypes_array.flatten())
if (i + 1) % 10000 == 0:
print(f" Processed {i + 1} variants...")
# Convert to a 2D numpy array: dimensions (num_variants, num_samples * 2)
vcf_genotype_matrix = np.array(genotype_list, dtype=np.int8)
print(f"\nVCF genotype matrix created:")
print(f" Shape: {vcf_genotype_matrix.shape}")
print(f" Dtype: {vcf_genotype_matrix.dtype}")
print(f" Number of variants: {vcf_genotype_matrix.shape[0]}")
print(f" Number of samples: {vcf_genotype_matrix.shape[1]}")
print(f"\nExample - first 5 genotypes of the first sample:")
print(vcf_genotype_matrix[:5, :5])
Extracting genotypes from VCF...
Processed 10000 variants...
Processed 20000 variants...
VCF genotype matrix created:
Shape: (24924, 3200)
Dtype: int8
Number of variants: 24924
Number of samples: 3200
Example - first 5 genotypes of the first sample:
[[1 1 0 1 1]
[1 1 0 1 1]
[0 0 0 0 0]
[1 0 0 1 0]
[0 0 0 0 0]]
Compare genotype matrices#
Compare the two genotype matrices:
# Check matching between VCF and TreeSequence genotype matrices: test for shape and values
if vcf_genotype_matrix.shape == ts_genotype_matrix.shape:
print(f"✓ Matrices have the same shape: {vcf_genotype_matrix.shape}")
# Both matrices have shape (num_sites, num_samples * 2 intended as haploid individuals)
# test for site-wise matches
matches = np.all(vcf_genotype_matrix == ts_genotype_matrix, axis=1)
num_matches = np.sum(matches)
num_total = len(matches)
print(f"\nGlobal result:")
print(f" Sites with matching genotypes: {num_matches}/{num_total} ({100*num_matches/num_total:.2f}%)")
if num_matches != num_total:
# Find sites with mismatches
mismatches = np.where(~matches)[0]
print(f"\n Number of sites with mismatches: {len(mismatches)}")
print(f" First 5 sites with mismatches: {mismatches[:5]}")
# Analyze an example of mismatch
if len(mismatches) > 0:
site_idx = mismatches[0]
print(f"\n Example - Site {site_idx}:")
# Find which samples have mismatches
site_matches = vcf_genotype_matrix[site_idx] == ts_genotype_matrix[site_idx]
discordant_samples = np.where(~site_matches)[0]
print(f" Samples with mismatches: {len(discordant_samples)}")
if len(discordant_samples) > 0:
sample_idx = discordant_samples[0]
print(f" Sample {sample_idx}:")
print(f" VCF: {vcf_genotype_matrix[site_idx, sample_idx]}")
print(f" TreeSequence: {ts_genotype_matrix[site_idx, sample_idx]}")
else:
print("\n✓ All matrices match perfectly!")
# Statistics per sample
# Per-sample match stats (all sites must match for a sample to be "perfect")
sample_matches = np.all(vcf_genotype_matrix == ts_genotype_matrix, axis=0)
num_perfect_samples = int(np.sum(sample_matches))
print(f"\nSamples with all matching genotypes: {num_perfect_samples}/{ts_threads.num_samples}")
if num_perfect_samples != ts_threads.num_samples:
mismatched_samples = np.where(~sample_matches)[0]
print(f" Number of samples with mismatches: {len(mismatched_samples)}")
print(f" First 10 samples with mismatches: {mismatched_samples[:10].tolist()}")
else:
print(f"✗ Matrices have different shapes!")
print(f" VCF: {vcf_genotype_matrix.shape}")
print(f" TreeSequence: {ts_genotype_matrix.shape}")
✓ Matrices have the same shape: (24924, 3200)
Global result:
Sites with matching genotypes: 24924/24924 (100.00%)
✓ All matrices match perfectly!
Samples with all matching genotypes: 3200/3200