Explore threads metadata#

Threads metadata has been added to the threads TreeSequence with the annotate_tree script. Let’s explore data and check if everything is fine.

import itertools

import cyvcf2
import tszip
import pandas as pd
import numpy as np

from tskitetude import get_data_dir, get_project_dir

Let’s load information on sample names:

# Load the tab-separated sample map produced by the pipeline; the file has no
# header line, so column names are assigned explicitly.
sample_names_path = get_data_dir() / "toInfer/tsm100M300I.sample_names.txt"
sample_info = pd.read_csv(
    sample_names_path,
    sep="\t",
    header=None,
    names=["population", "individual"],
)
sample_info.head()
population individual
0 A tsk_486
1 II tsk_276
2 A tsk_506
3 II tsk_115
4 D tsk_1388

Now open the threads object:

Warning

We are opening the fitted version of the trees, since they contain variant information.

# Load the compressed (tszip) threads TreeSequence from the fitted results.
threads_trees_path = (
    get_project_dir() / "results-threads/toInfer-fit/threads/ts300I25k.1.trees.tsz"
)
ts_threads = tszip.load(threads_trees_path)
ts_threads
Tree Sequence
Trees24 857
Sequence Length99 988 756
Time Unitsunknown
Sample Nodes3 200
Total Size354.1 MiB
Metadata
dict chromosome: 1
offset: 6714
Table Rows Size Has Metadata
Edges 8 329 337 254.2 MiB
Individuals 1 600 80.8 KiB
Migrations 0 8 Bytes
Mutations 24 924 900.6 KiB
Nodes 1 304 983 34.8 MiB
Populations 8 271 Bytes
Provenances 1 385 Bytes
Sites 24 924 608.5 KiB
Provenance Timestamp Software Name Version Command Full record
17 December, 2025 at 08:29:37 PM threads 0.2.1 Unknown
Details
dict
software:
dict name: threads
version: 0.2.1

parameters:
dict input_file: ts300I25k.1.biallelic.vcf.gz
metadata_added: True
populations_added: 8
individuals_added: 1600

timestamp: 2025-12-
17T20:29:37.025492+00:00
description: Tree sequence annotated with
population and individual
metadata
To cite this software, please consult the citation manual: https://tskit.dev/citation/

Explore the metadata of the first 5 nodes:

# Preview population and individual metadata attached to the first 5 nodes.
for node in itertools.islice(ts_threads.nodes(), 5):
    node_population = ts_threads.population(node.population)
    node_individual = ts_threads.individual(node.individual)
    print(f"Node {node.id}: population={node_population}, individual={node_individual}")
Node 0: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=0, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([0, 1], dtype=int32), metadata={'sample_id': 'tsk_105'})
Node 1: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=0, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([0, 1], dtype=int32), metadata={'sample_id': 'tsk_105'})
Node 2: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=1, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([2, 3], dtype=int32), metadata={'sample_id': 'tsk_106'})
Node 3: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=1, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([2, 3], dtype=int32), metadata={'sample_id': 'tsk_106'})
Node 4: population=Population(id=3, metadata={'breed': 'II'}), individual=Individual(id=2, flags=0, location=array([], dtype=float64), parents=array([], dtype=int32), nodes=array([4, 5], dtype=int32), metadata={'sample_id': 'tsk_107'})

Ok, it seems that the metadata follow the sample order in the VCF file. Let’s prove it by checking the sample names in ts_threads.individuals():

# Summarize sample/individual counts and build a sample_id -> node ids map.
print(f"Num. samples: {ts_threads.num_samples}")
print(f"Num. individuals: {ts_threads.num_individuals}")

sample2nodes = {}

for i, individual in enumerate(ts_threads.individuals()):
    # look the metadata up once per individual
    sample_id = individual.metadata["sample_id"]

    if i < 5:
        # NOTE: the original nested double quotes inside a double-quoted
        # f-string, which is a SyntaxError before Python 3.12 (PEP 701);
        # using a local variable keeps this portable
        print(f"Individual {individual.id}: sample='{sample_id}', nodes={individual.nodes.tolist()}")

    sample2nodes[sample_id] = individual.nodes.tolist()
Num. samples: 3200
Num. individuals: 1600
Individual 0: sample='tsk_105', nodes=[0, 1]
Individual 1: sample='tsk_106', nodes=[2, 3]
Individual 2: sample='tsk_107', nodes=[4, 5]
Individual 3: sample='tsk_108', nodes=[6, 7]
Individual 4: sample='tsk_109', nodes=[8, 9]

Let’s discover individual breeds using their node information:

# Map each sample to the set of population "breed" labels carried by its nodes.
sample2breed = {}

for sample_id, node_ids in sample2nodes.items():
    breeds = sample2breed.setdefault(sample_id, set())
    for node_id in node_ids:
        population_id = ts_threads.node(node_id).population
        breeds.add(ts_threads.population(population_id).metadata["breed"])

Test that sample and breed mapping is correct:

# test that sample and breed mapping is correct:
for sample_id, breed_set in sample2breed.items():
    expected_breed = sample_info.loc[
        sample_info["individual"] == sample_id, "population"
    ].values[0]
    assert expected_breed in breed_set, "Sample to breed mapping is incorrect!"

Now test that the genotypes are consistent with sample names: using VCF as a reference, read the focal sample generated through the pipeline:

# Open the focal VCF produced by the pipeline and preview its sample order.
vcf_file = get_project_dir() / "results-threads/toInfer/focal/ts300I25k.1.vcf.gz"
with cyvcf2.VCF(vcf_file) as vcf_reader:
    vcf_samples = vcf_reader.samples
    print(vcf_samples[:5])  # show first 5 sample names in VCF
['tsk_105', 'tsk_106', 'tsk_107', 'tsk_108', 'tsk_109']

Check that the number of variants is the same:

# count number of variants in VCF file (close the reader when done, instead of
# leaking the handle opened inline inside a generator expression)
with cyvcf2.VCF(vcf_file) as vcf_reader:
    num_variants = sum(1 for _ in vcf_reader)
print(f"N. of variants in VCF: {num_variants}")

# count number of variants in TS file
num_ts_variants = ts_threads.num_sites
print(f"N. of variants TreeSequence: {num_ts_variants}")

# check variant size
if num_variants == num_ts_variants:
    # no placeholders here, so a plain string literal is enough
    print("✓ N. of variants matches (between TS and VCF)!")
else:
    print(f"✗ Warning: VCF has {num_variants} variants, TreeSequence has {num_ts_variants}")
N. of variants in VCF: 25000
N. of variants TreeSequence: 24924
✗ Warning: VCF has 25000 variants, TreeSequence has 24924

Ok, the positions are not the same: from what I saw, the first position in ts_threads is 0, so let’s read the first position in the VCF file and determine the offset of the TS object:

# Derive the coordinate offset from the first VCF record: TS site positions
# start at 0, while VCF positions are absolute 1-based coordinates.
with cyvcf2.VCF(vcf_file) as vcf_reader:
    first_variant = next(vcf_reader)
    offset = first_variant.POS

# translate the TS positions
ts_positions = np.array(
    [int(site.position) + offset for site in ts_threads.sites()],
    dtype=np.int64,
)
print(ts_positions[:10])  # show first 10 positions

# read positions from VCF
with cyvcf2.VCF(vcf_file) as vcf_reader:
    vcf_positions = np.array([record.POS for record in vcf_reader])
print(vcf_positions[:10])  # show first 10 positions
[ 6714  7016  8568 15841 19058 19766 20650 24037 29977 37199]
[ 6714  7016  8568 15841 19058 19766 20650 24037 29977 37199]
# Find which variants are missing from the TreeSequence
# Get positions from both VCF and TreeSequence
print(f"TreeSequence positions: {len(ts_positions)} variants")
print(f"VCF positions: {len(vcf_positions)} variants")

print("\nExtracting variant positions from VCF...")

# collect information on variants
vcf_variants_info = []

with cyvcf2.VCF(vcf_file) as vcf_reader:
    for variant in vcf_reader:
        vcf_variants_info.append({
            'chrom': variant.CHROM,
            'pos': variant.POS,
            'ref': variant.REF,
            'alt': ','.join(variant.ALT) if variant.ALT else '',
            'qual': variant.QUAL,
            'filter': variant.FILTER if variant.FILTER else 'PASS',
        })

# create a dataframe from variant information
vcf_variants_df = pd.DataFrame(vcf_variants_info)

# Find missing positions (in VCF but not in TreeSequence)
missing_in_ts = np.setdiff1d(vcf_positions, ts_positions)
print(f"\nVariants in VCF but missing in TreeSequence: {len(missing_in_ts)}")

if len(missing_in_ts) > 0:
    # filter out missing variants
    missing_variants_df = vcf_variants_df[vcf_variants_df['pos'].isin(missing_in_ts)].copy()

    # a set gives O(1) membership tests in the per-variant loop below;
    # testing `in` against a numpy array scans the whole array per record
    missing_positions = set(missing_in_ts.tolist())

    genotype_stats = []

    with cyvcf2.VCF(vcf_file) as vcf_reader:
        for variant in vcf_reader:
            if variant.POS in missing_positions:
                # keep only the two allele calls, dropping the phasing flag
                gts = np.array([gt[:2] for gt in variant.genotypes])
                unique_alleles = np.unique(gts[gts >= 0])  # Exclude missing (-1)

                genotype_stats.append({
                    'pos': variant.POS,
                    'unique_alleles': len(unique_alleles),
                    'is_monomorphic': len(unique_alleles) == 1,
                    'missing_gts': np.sum(gts == -1),
                    'total_gts': gts.size,
                    'allele_counts': dict(zip(*np.unique(gts.flatten(), return_counts=True)))
                })

    genotype_stats_df = pd.DataFrame(genotype_stats)

    # join information
    missing_variants_full = missing_variants_df.merge(genotype_stats_df, on='pos')

    # uncomment to save data
    # missing_variants_full.to_csv('missing_variants.csv', index=False)

else:
    print("\n✓ All VCF variants are present in TreeSequence")

# Also check if TreeSequence has any variants not in VCF (unlikely but possible)
extra_in_ts = np.setdiff1d(ts_positions, vcf_positions)
if len(extra_in_ts) > 0:
    extra_df = pd.DataFrame({'pos': extra_in_ts})
TreeSequence positions: 24924 variants
VCF positions: 25000 variants

Extracting variant positions from VCF...
Variants in VCF but missing in TreeSequence: 76

Let’s explore those variants missing in the threads TreeSequence:

missing_variants_full
chrom pos ref alt qual filter unique_alleles is_monomorphic missing_gts total_gts allele_counts
0 1 1359872 C T None PASS 1 True 0 3200 {0: 3200}
1 1 2602623 C A None PASS 1 True 0 3200 {0: 3200}
2 1 3132835 T C None PASS 1 True 0 3200 {0: 3200}
3 1 3813782 G A,T None PASS 3 False 0 3200 {0: 111, 1: 470, 2: 2619}
4 1 5065536 C T None PASS 1 True 0 3200 {0: 3200}
... ... ... ... ... ... ... ... ... ... ... ...
71 1 94983763 A T None PASS 1 True 0 3200 {0: 3200}
72 1 96413425 T G None PASS 1 True 0 3200 {0: 3200}
73 1 98046345 C T,G None PASS 3 False 0 3200 {0: 102, 1: 3037, 2: 61}
74 1 99412716 G C None PASS 1 True 0 3200 {0: 3200}
75 1 99701719 T C,G None PASS 3 False 0 3200 {0: 70, 1: 2987, 2: 143}

76 rows × 11 columns

# Count monomorphic variants
monomorphic_count = missing_variants_full['is_monomorphic'].sum()
print(f"Number of monomorphic variants: {monomorphic_count}")

# Group by unique_alleles and count
allele_counts = missing_variants_full.groupby('unique_alleles').size()
# placeholder-free strings need no f-prefix
print("\nCounts by number of unique alleles:")
print(allele_counts)

# More detailed breakdown
print("\nDetailed breakdown:")
for n_alleles, count in allele_counts.items():
    percentage = 100 * count / len(missing_variants_full)
    print(f"  {n_alleles} unique allele(s): {count} variants ({percentage:.1f}%)")
Number of monomorphic variants: 53

Counts by number of unique alleles:
unique_alleles
1    53
2     1
3    22
dtype: int64

Detailed breakdown:
  1 unique allele(s): 53 variants (69.7%)
  2 unique allele(s): 1 variants (1.3%)
  3 unique allele(s): 22 variants (28.9%)

So the missing variants are monomorphic SNPs (no alternates) and SNPs with more than 2 alleles. What about the SNP with two alleles? Let’s check it:

missing_variants_full[missing_variants_full["unique_alleles"] == 2]
chrom pos ref alt qual filter unique_alleles is_monomorphic missing_gts total_gts allele_counts
16 1 14337436 G A,C None PASS 2 False 0 3200 {0: 3152, 1: 48}

Ok, it is a SNP with three alleles, but the reference allele is not found in any sample, which is why it has 2 unique alleles instead of 3. Let’s determine the positions to skip in order to check genotypes:

skip_positions = missing_variants_full["pos"].tolist()

Extract TreeSequences Genotypes#

Collect the full genotype matrix from TS object:

# collect a matrix of (num_sites, num_samples)
ts_genotype_matrix = ts_threads.genotype_matrix()
n_sites, n_samples = ts_genotype_matrix.shape
print(f"Genotype matrix from TS: {ts_genotype_matrix.shape}")
print(f"Number of sites: {n_sites}")
print(f"Number of samples: {n_samples}")
print("\nSubsetting first 5 objects in both dimensions")
print(ts_genotype_matrix[:5, :5])
Genotype matrix from TS: (24924, 3200)
Number of sites: 24924
Number of samples: 3200

Subsetting first 5 objects in both dimensions
[[1 1 0 1 1]
 [1 1 0 1 1]
 [0 0 0 0 0]
 [1 0 0 1 0]
 [0 0 0 0 0]]

Extract VCF genotypes#

Collect the full genotype matrix from VCF (remember to skip filtered variants):

with cyvcf2.VCF(vcf_file) as vcf_reader:
    print("\nExtracting genotypes from VCF...")

    # a set gives O(1) membership tests instead of scanning the
    # skip_positions list once per variant (~76 comparisons x 25k records)
    skip_set = set(skip_positions)

    genotype_list = []

    for i, variant in enumerate(vcf_reader):
        if variant.POS in skip_set:
            continue  # those variants are missing in TS

        # Extract alleles as a numpy array directly
        # variant.genotypes is a list, we take only the first 2 elements (alleles)
        # Unnest genotypes: create a (num_samples * 2,) flat array for this variant
        genotypes_array = np.array([gt[:2] for gt in variant.genotypes], dtype=np.int8)
        genotype_list.append(genotypes_array.flatten())

        if (i + 1) % 10000 == 0:
            print(f"  Processed {i + 1} variants...")

    # Convert to a 2D numpy array: dimensions (num_variants, num_samples * 2)
    vcf_genotype_matrix = np.array(genotype_list, dtype=np.int8)

    print("\nVCF genotype matrix created:")
    print(f"  Shape: {vcf_genotype_matrix.shape}")
    print(f"  Dtype: {vcf_genotype_matrix.dtype}")
    print(f"  Number of variants: {vcf_genotype_matrix.shape[0]}")
    print(f"  Number of samples: {vcf_genotype_matrix.shape[1]}")

    print("\nExample - first 5 genotypes of the first sample:")
    print(vcf_genotype_matrix[:5, :5])
Extracting genotypes from VCF...
  Processed 10000 variants...
  Processed 20000 variants...
VCF genotype matrix created:
  Shape: (24924, 3200)
  Dtype: int8
  Number of variants: 24924
  Number of samples: 3200

Example - first 5 genotypes of the first sample:
[[1 1 0 1 1]
 [1 1 0 1 1]
 [0 0 0 0 0]
 [1 0 0 1 0]
 [0 0 0 0 0]]

Compare genotype matrices#

Compare the two genotype matrices:

# Check matching between VCF and TreeSequence genotype matrices: test for shape and values
if vcf_genotype_matrix.shape == ts_genotype_matrix.shape:
    print(f"✓ Matrices have the same shape: {vcf_genotype_matrix.shape}")

    # Both matrices have shape (num_sites, num_samples * 2 intended as haploid individuals)
    # test for site-wise matches
    matches = np.all(vcf_genotype_matrix == ts_genotype_matrix, axis=1)
    num_matches = np.sum(matches)
    num_total = len(matches)

    # placeholder-free strings need no f-prefix
    print("\nGlobal result:")
    print(f"  Sites with matching genotypes: {num_matches}/{num_total} ({100*num_matches/num_total:.2f}%)")

    if num_matches != num_total:
        # Find sites with mismatches
        mismatches = np.where(~matches)[0]
        print(f"\n  Number of sites with mismatches: {len(mismatches)}")
        print(f"  First 5 sites with mismatches: {mismatches[:5]}")

        # Analyze an example of mismatch
        if len(mismatches) > 0:
            site_idx = mismatches[0]
            print(f"\n  Example - Site {site_idx}:")

            # Find which samples have mismatches
            site_matches = vcf_genotype_matrix[site_idx] == ts_genotype_matrix[site_idx]
            discordant_samples = np.where(~site_matches)[0]

            print(f"    Samples with mismatches: {len(discordant_samples)}")
            if len(discordant_samples) > 0:
                sample_idx = discordant_samples[0]
                print(f"    Sample {sample_idx}:")
                print(f"      VCF:          {vcf_genotype_matrix[site_idx, sample_idx]}")
                print(f"      TreeSequence: {ts_genotype_matrix[site_idx, sample_idx]}")
    else:
        print("\n✓ All matrices match perfectly!")

    # Statistics per sample
    # Per-sample match stats (all sites must match for a sample to be "perfect")
    sample_matches = np.all(vcf_genotype_matrix == ts_genotype_matrix, axis=0)
    num_perfect_samples = int(np.sum(sample_matches))

    print(f"\nSamples with all matching genotypes: {num_perfect_samples}/{ts_threads.num_samples}")

    if num_perfect_samples != ts_threads.num_samples:
        mismatched_samples = np.where(~sample_matches)[0]
        print(f"  Number of samples with mismatches: {len(mismatched_samples)}")
        print(f"  First 10 samples with mismatches: {mismatched_samples[:10].tolist()}")
else:
    print("✗ Matrices have different shapes!")
    print(f"  VCF:          {vcf_genotype_matrix.shape}")
    print(f"  TreeSequence: {ts_genotype_matrix.shape}")
✓ Matrices have the same shape: (24924, 3200)
Global result:
  Sites with matching genotypes: 24924/24924 (100.00%)

✓ All matrices match perfectly!
Samples with all matching genotypes: 3200/3200