import glob
import os
import json
from io import StringIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors
from utils import TREATMENTS, COLORS, DATASETS

genealogies = pd.concat([pd.read_csv(x) for x in DATASETS])
PATH_IMPORT = "raw/24-03-22/**/summary.json"
ignore = [
"/Breseq/WT_s/WT_a7/WT_e7/data", # Duplicated data
"/Breseq/17_s/Line_17/data",     # Duplicated data
"/Breseq/PX-10/",                # Unrelated data
"/Breseq/PE-5-WT-L-E3",          # Low coverage
"/Breseq/PE-4-WT-L-E3"           # Re-sequenced
]

EXPORT_PATH = '04_mutations'
EXPORT_TREE_PATH = '04_mutations/trees'

if not os.path.exists(EXPORT_PATH):
    os.mkdir(EXPORT_PATH)
if not os.path.exists(EXPORT_TREE_PATH):
    os.mkdir(EXPORT_TREE_PATH)

def read_breseq_summary(file):
    js = json.load(open(file,'r'))
    assert len(js['references']['reference'].keys()) == 1
    ref = list(js['references']['reference'].keys())[0]
    return dict(folder=file.replace('/data/summary.json','').replace('raw/24-03-22/',''),
                 name=file.split("/")[-3],
                 ref=ref,
                 coverage_ref=js['references']['reference'][ref]['coverage_average'],
                 fraction_aligned_reads=js['reads']['total_fraction_aligned_reads'],
                 n_reads=js['reads']['total_reads'],
                 read_files = ", ".join(js['reads']['read_file'].keys()),
                 n_read_files=len(js['reads']['read_file'].keys()))
        
def read_breseq_mutation_table(file):
    with open(file, 'r') as fh:
        html = fh.read()
    read = pd.read_html(StringIO(html.split('<!--Output Html_Mutation_Table_String-->')[1].split('</body>')[0]))[0]
    read.columns = [x[1] for x in read.columns]
    read['file'] = file
    return read

def parse_PE(x):
    try:
        x = x.replace('-','_')
        splitted = x.split('_')
        if x.startswith("Line_17"):
            geno = "LCS-" if "mutS" in x else "LCS+"
            return(dict(pe=x,
                        cycle=0, size="",
                        ancestor=geno, treatment="",
                        tube="", genotype=f"{geno} ancestor"))

        if "PE" in x and "Line" not in x:
            cycle = int(splitted[1])+1
            large = splitted[-1][0].upper() == splitted[-1][0] 
            tube = splitted[-1]
            geno = 'LCS-' if 'WT_' in x else 'LCS+'
            size = 'L' if large else 'S'

            try:
                name = f"{size}-{geno}_{cycle}-I-{tube}"
                parent = genealogies.query(f"name=='{name}'").parent.values[0].replace("LLCS_ROOT","LCS- ancestor")
            except Exception:
                parent = 'UNK'
            id_geno = f"{parent.replace('_',':')}→{cycle}-I-{tube}"

        elif splitted[0] in ("WT","17"):
            cycle = 11
            large = splitted[-1][0].upper() == splitted[-1][0] 
            size = 'L' if large else 'S'
            tube  = splitted[-1]
            geno  = {'WT':'LCS-',"17":"LCS+"}[splitted[0]]
            id_geno = f"{size}-{geno}: 10-II-{tube}→"
            parent = f"{size}-{geno}_10-II-{tube}"
        else:
            return {"pe":x, "microcosm":x}
        exp = size + '-' + geno
        return(dict(pe=x,
                    cycle=cycle, size=size,
                    parent=parent,
                        ancestor=geno, treatment=exp,
                        tube=tube, genotype=id_geno))

    except Exception as ex:
          return {'error':ex, "pe":x}

metadata = []
data = []
for s in glob.glob(PATH_IMPORT, recursive=True):
    if not any([x in s for x in ignore]):
        metadata.append(read_breseq_summary(s))
        metadata[-1].update(parse_PE(metadata[-1]['name'].replace('org','')))
        h = s.replace("data/summary.json", 'output/index.html')
        data.append(read_breseq_mutation_table(h))
        try:
            data[-1]['genotype'] = metadata[-1]['genotype']
        except: 
            print(metadata[-1])
            raise 
    else:
        print("Ignored:", s)
metadata = pd.DataFrame(metadata)
data = pd.concat(data)

Ignored: raw/24-03-22/Breseq/PE-4-WT-L-E3org/data/summary.json
Ignored: raw/24-03-22/Breseq/PX-10/PX_17_a6/data/summary.json
Ignored: raw/24-03-22/Breseq/PX-10/PX_17_b3/data/summary.json
Ignored: raw/24-03-22/Breseq/PX-10/PX_17_c3/data/summary.json
Ignored: raw/24-03-22/Breseq/PX-10/PX_17_e5/data/summary.json

Ignored: raw/24-03-22/Breseq/17_s/Line_17/data/summary.json

Ignored: raw/24-03-22/Breseq/WT_s/WT_a7/WT_e7/data/summary.json

Ignored: raw/24-03-22/Breseq/PE-4-WT-L-E3/data/summary.json
Ignored: raw/24-03-22/Breseq/PE-5-WT-L-E3/data/summary.json

# We consider that two mutations are the same if 
# ["position","mutation","annotation","gene","description"] are equal. 
# Two entries with the same values in those columns but different evidence are 
# considered the same mutation.
# Example: At position 1354000 there is a Δ9 bp deletion in wspB that is sometimes detected
# with JC sometimes with MC and JC
data.fillna({"annotation":''}, inplace=True)
cols = ["position","mutation","annotation","gene","description"]
count = data.groupby(cols).count().sort_values('genotype').genotype
count.name = "time_sequenced"

mutations = pd.pivot(data, index=cols, columns='genotype', values='evidence').fillna(0)
mutations['time_sequenced'] = count
mutations = mutations[sorted(mutations.columns, reverse=True)].reset_index()
mutations.columns.name = ''
mutations['mut_id'] = [f"MUT_{x:04}" for x in range(mutations.shape[0])]
mutations['name'] = mutations['mut_id']

# Checks that the mutations are unique
assert mutations.shape[0] == mutations.loc[:,cols].drop_duplicates().shape[0]

lcsp_ancestor = set(x.mut_id for k,x in mutations.iterrows() if x['LCS+ ancestor'])
lcsm_ancestor = set(x.mut_id for k,x in mutations.iterrows() if x['LCS- ancestor'])
all_sequences = set(mutations[mutations.time_sequenced==data.file.nunique()].mut_id)
any_ancestors = set.union(lcsm_ancestor, lcsp_ancestor)
both_ancestors = set.intersection(lcsm_ancestor, lcsp_ancestor)

print(f"There are:\n\t {mutations.shape[0]} unique mutations")
print(f"\t{len(lcsp_ancestor)} in LCS+ ancestor")
print(f"\t{len(lcsm_ancestor)} in LCS- ancestor")
print(f"\t{len(all_sequences)} in all sequencing")
print(f"\t{len(any_ancestors)} in any of the two ancestors")
print(f"\t{len(both_ancestors)} in any of the two ancestors")

There are:
	 9271 unique mutations
	58 in LCS+ ancestor
	74 in LCS- ancestor
	28 in all sequencing
	78 in any of the two ancestors
	54 in any of the two ancestors

# Count the number of mutations in metadata
cmut = data.groupby('genotype').count().mutation
metadata.set_index('genotype',inplace=True)
metadata["n_mutations"] = cmut

mut_mutations = mutations[['mut' in x.gene for k,x in mutations.iterrows()]].copy()
mut_mutations['mutation_name'] = [x.gene.split()[0]+' '+x.annotation.split()[0] for k,x in mut_mutations.iterrows()]
col_mut = cols+["time_sequenced"]
mut_mutations = mut_mutations.drop(columns=col_mut).set_index('mutation_name').drop(columns=['mut_id','name']).transpose()

# EXPORT 
metadata.to_csv(os.path.join(EXPORT_PATH, "sequenced_cultures.csv"))
mutation_description = mutations.loc[:, ['mut_id', 'position', 'mutation',
                                         'annotation', 'gene', 'description',
                                         'time_sequenced', 'name']].set_index('mut_id')
mutation_description.to_csv(os.path.join(EXPORT_PATH,"mutation_description.csv"))

long_table = pd.merge(data, mutations.loc[:,cols+['mut_id']], left_on=cols, right_on=cols, )
long_table.to_csv(os.path.join(EXPORT_PATH,"sequencing_long_table.csv"))

wide_table = mutations.set_index('mut_id')
wide_table.to_csv(os.path.join(EXPORT_PATH,"sequencing_wide_table.csv"))

mut_mutations.to_csv(os.path.join(EXPORT_PATH,"mut_mutations_wide.csv"))

all_data = pd.merge(long_table, metadata, left_on="genotype", right_index=True)
for treat, df in all_data.groupby('treatment'):
    if treat:
        d = df.loc[:, ['parent', 'mut_id', 'cycle']].rename(columns={"parent":"node", "mut_id":"mutation"})
        anc = f"{treat.split('-',1)[1]} ancestor"
        r = all_data[all_data.genotype==anc].loc[:, ['genotype', 'mut_id']].rename(columns={"genotype":"node", "mut_id":"mutation"})
        r.replace(anc, treat+"_"+"ROOT", inplace=True)
        out = pd.concat([d,r]).reset_index(drop=True).fillna({'cycle':0})
        out['endpoint'] = [c == 11 or c == 0 for c in out.cycle]
        out.loc[:, ['node', 'mutation']].drop_duplicates().to_csv(os.path.join(EXPORT_TREE_PATH,treat+"_mutations.csv"))
        out.loc[out.endpoint==True, ['node', 'mutation']].drop_duplicates().to_csv(os.path.join(EXPORT_TREE_PATH,treat+"_mutations_endpoints.csv"))

import sqlite3
with sqlite3.connect("lce_data.sqlite") as database:
    metadata.drop(columns=['cycle','size','ancestor','treatment','tube','name','pe']).to_sql("sequenced_cultures", database, if_exists="replace", dtype={'genotype':"TEXT PRIMARY KEY"})
    mutation_description.drop(columns=['name']).to_sql("mutations", database, if_exists="replace", dtype={'mut_id':"TEXT PRIMARY KEY"})
    long_table.loc[:,["evidence","mut_id","genotype"]].to_sql("sequencing", database, if_exists="replace", index=False)

fig, ax = plt.subplots(1,1, figsize=(7,4))
for k, v in metadata.groupby(["ancestor", "size"]):
    ax.scatter(v.cycle-(1 if k[1] else 0)+(np.random.random(v.shape[0])-0.5)*0.8, v.n_mutations, 
               label=f"{k[1]}-{k[0]}" if k[1] else k[0]+' ancestor',
               color=COLORS[f"{k[1]}-{k[0]}"] if k[1] else 'k',
               marker='.' if k[1] else {"LCS+":"x","LCS-":"+"}[k[0]])
ax.legend()
ax.set(xlabel='Cycle', 
       xlim=(-0.5,10.5), 
       ylim=(0,420),
       ylabel="Number of mutations identified",
       xticks=np.arange(11))
for x in np.arange(11)+0.5:
    plt.axvline(x,color='k',ls='--', lw=1)
fig.savefig(os.path.join(EXPORT_PATH, 'number_of_mutations.svg'),bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH, 'number_of_mutations.pdf'),bbox_inches='tight')

filtered_out = long_table.copy()
print(f"{filtered_out.shape[0]} entries")
for anc,anc_name in (["LCS+ ancestor", "LCS+"], ["LCS- ancestor", "LCS-"]):
    anc_dt = frozenset(filtered_out[filtered_out.genotype==anc].mut_id)
    genotypes = frozenset(metadata[metadata.ancestor==anc_name].index)
    print(f"Ancestor {anc}: {len(anc_dt)} mutations to be filtered out of {len(genotypes)} datasets")
    
    idx = np.logical_not(np.logical_and([x in genotypes for x in filtered_out.genotype],
                                        [x in anc_dt for x in filtered_out.mut_id]))
    filtered_out = filtered_out.loc[idx]
    print(filtered_out.shape[0])
    
nmut2 = filtered_out.groupby("genotype").count().mut_id
metadata2 = metadata.copy()
metadata2['n_mutations'] = nmut2

fig, ax = plt.subplots(1,1, figsize=(7,4))
for k, v in metadata2.groupby(["ancestor", "size"]):
    if k[1]:
        ax.scatter(v.cycle-(1 if k[1] else 0)+(np.random.random(v.shape[0])-0.5)*0.8, v.n_mutations, 
                   label=f"{k[1]}-{k[0]}" if k[1] else k[0]+' ancestor',
                   color=COLORS[f"{k[1]}-{k[0]}"] if k[1] else 'k',
                   marker='.' if k[1] else {"LCS+":"x","LCS-":"+"}[k[0]])
ax.legend()
ax.set(xlabel='Cycle', 
       xlim=(-0.5,10.5), 
       #ylim=(-1,420),
       ylabel="Number of mutations identified",
       xticks=np.arange(11))
for x in np.arange(11)+0.5:
    plt.axvline(x,color='k',ls='--', lw=1)
plt.axhline(0,color='k',ls='--', lw=1)
fig.savefig(os.path.join(EXPORT_PATH, 'filtered_number_of_mutations.svg'),bbox_inches='tight')
fig.savefig(os.path.join(EXPORT_PATH, 'filtered_number_of_mutations.pdf'),bbox_inches='tight')

37819 entries
Ancestor LCS+ ancestor: 58 mutations to be filtered out of 108 datasets
32252
Ancestor LCS- ancestor: 74 mutations to be filtered out of 90 datasets
25965

cat = {'folder': 'Breseq', 'name': 'ID',
       'ref': 'Reference', 'coverage_ref': 'Reference', 
       'fraction_aligned_reads': 'Reads', 'n_reads': 'Reads',
       'read_files': 'Reads', 'n_read_files': 'Reads', 
       'cycle': 'ID', 'size': 'ID', 'ancestor': 'ID', 'treatment': 'ID',
       'tube': 'ID', 'genotype': 'ID', 'n_mutations':'Mutations'}
metadisp = metadata.copy().reset_index()
metadisp = pd.merge(metadisp, mut_mutations, left_on='genotype',right_index=True)
cols_mut = list(mut_mutations.columns) 
metadisp = metadisp[["name","genotype","treatment", 
        "n_mutations",
        "n_read_files", "n_reads", "fraction_aligned_reads",
        "ref","coverage_ref"]+cols_mut ]
metadisp.set_index('genotype',inplace=True)
metadisp.sort_values('n_mutations', inplace=True)
metadisp.columns = pd.MultiIndex.from_tuples([(cat[x], x)
                                              if x in cat 
                                              else (x.split()[0], x)
                                              for x in metadisp.columns
                                              ])
C0 = matplotlib.colors.to_hex('C0')
border =  [{'selector': 'th', 'props': 'border-left: 1px solid black'},
           {'selector': 'td', 'props': 'border-left: 1px solid black'}]
html = metadisp.style.set_table_styles({
    ('ID','name'): border
}, overwrite=False, axis=0)\
.set_sticky(axis="index")\
.set_sticky(axis="columns")\
.bar(subset=[('Reference','coverage_ref'), ('Reads','fraction_aligned_reads'),('Reads',"n_reads"),('Mutations',"n_mutations")], color=matplotlib.colors.to_hex('C0'))\
.map(lambda x: (f"background-color:{matplotlib.colors.to_hex(COLORS[x])}" if x in COLORS else ''), subset=[('ID','treatment')])\
.map(lambda x: (f"background-color:{C0};color:{C0};border-color:black" if x else 'color:white;border-color:black'), 
     subset=[(x.split()[0],x) for x in cols_mut])\
.map_index(lambda v: "max-width:3em;font-size:0.8em;" if 'mut' in v and len(v.split())>1 else '', axis=1)\
.to_html()

with open(os.path.join(EXPORT_PATH,'sequencing.html'),'w') as f:
    f.write("<html><head><meta charset=\"utf-8\"><title>Sequencing Data</title><style> table, th, td {border: 1px solid} table{border-collapse: collapse;} thead{background-color:white;} </style></head><body>")
    f.write(html)
    f.write("</body>")

key_col = lambda x: ((0,0) if ("ancestor" in x or 'ROOT' in x)
                     else (0,-1) if ("name" in x or "All" in x)
                     else ({'+':0,'-':1}[x[5]],{'S':0,'L':1}[x[0]], int(x.split(":")[1].split('-')[0])))

common_mut = mutations.set_index('mut_id').loc[list(any_ancestors),:].reset_index()
common_mut['All Sequencing'] = [m in all_sequences for m in common_mut.mut_id]
common_mut = common_mut.set_index(cols+['time_sequenced','mut_id']).drop(columns='name')
common_columns = sorted(list(common_mut.columns), key=key_col)
common_mut = common_mut.loc[:,common_columns].reset_index()

common_mut = common_mut.sort_values(['All Sequencing','LCS- ancestor', 'LCS+ ancestor','time_sequenced'], ascending=False).reset_index(drop=True)
html = common_mut.style.set_sticky(axis="index")\
.map(lambda x: (f"background-color:{C0};color:{C0};border-color:black" if x else 'color:white;border-color:black'), 
     subset=common_columns)\
.set_sticky(axis="columns")\
.to_html()

with open(os.path.join(EXPORT_PATH,'common_mutations.html'),'w') as f:
    f.write("<html><head><meta charset=\"utf-8\"><title>Sequencing Data</title><style> table, th, td {border: 1px solid} table{border-collapse: collapse;} thead{background-color:white;} </style></head><body>")
    f.write(html)
    f.write("</body>")

m_mut = mutations[['mut' in x for x in mutations.gene]].copy()
m_mut['mutation_name'] = [x.gene.split()[0]+' '+x.annotation.split()[0] for k,x in m_mut.iterrows()]

m_mut = m_mut.set_index(cols+['time_sequenced','mut_id',]).drop(columns=['name'])
sorted_cols = sorted(list(m_mut.columns), key=key_col)
m_mut = m_mut.loc[:,sorted_cols].reset_index().drop(columns='mut_id').set_index(cols+['mutation_name','time_sequenced'])

print("Latex Table:")
print(80*"-")
print(m_mut.reset_index().loc[:,cols].rename(columns=lambda x: x.capitalize().replace("_"," ").replace("Time", r"\# times")).set_index('Gene').to_latex().replace('_',r"\_"))
print(80*"-")

Latex Table:
--------------------------------------------------------------------------------
\begin{tabular}{lrlll}
\toprule
 & Position & Mutation & Annotation & Description \\
Gene &  &  &  &  \\
\midrule
mutY ← & 355688 & C→T & G153D (GGC→GAC) & A/G‑specific adenine glycosylase \\
mutL → & 586966 & C→T & S5L (TCG→TTG) & DNA mismatch repair protein \\
mutL → & 587050 & A→G & E33G (GAG→GGG) & DNA mismatch repair protein \\
mutL → & 587260 & T→G & L103R (CTG→CGG) & DNA mismatch repair protein \\
mutL → & 587555 & C→T & L201L (CTC→CTT) & DNA mismatch repair protein \\
mutL → & 587737 & A→G & N262S (AAC→AGC) & DNA mismatch repair protein \\
mutL → & 588126 & (G)5→6 & coding (1174/1902 nt) & DNA mismatch repair protein \\
mutL → & 588551 & C→T & A533A (GCC→GCT) & DNA mismatch repair protein \\
mutS ← & 1300356 & C→T & G707S (GGC→AGC) & DNA mismatch repair protein \\
mutS ← & 1300673 & +A & coding (1802/2592 nt) & DNA mismatch repair protein \\
mutS ← & 1300783 & A→G & R564R (CGT→CGC) & DNA mismatch repair protein \\
mutS ← & 1300986 & T→G & T497P (ACC→CCC) & DNA mismatch repair protein \\
mutS ← & 1301392 & A→G & R361R (CGT→CGC) & DNA mismatch repair protein \\
mutS ← & 1302049 & A→G & R142R (CGT→CGC) & DNA mismatch repair protein \\
mutS ← & 1302080 & T→C & N132S (AAC→AGC) & DNA mismatch repair protein \\
mutS ← / ← PFLU\_1165 & 1302576 & G→A & intergenic (‑102/+68) & DNA mismatch repair protein/putative membrane protein \\
mutM ← & 6343361 & A→G & S46P (TCC→CCC) & formamidopyrimidine‑DNA glycosylase \\
\bottomrule
\end{tabular}

--------------------------------------------------------------------------------

m_mut.sort_values('time_sequenced',ascending=False, inplace=True)
yticks = [f"{x[5]:>15} ({x[6]})" for x in m_mut.index]
xticks = [x for x in m_mut.columns]

fig, ax = plt.subplots(1,1, figsize=(30,5))
ax.matshow(m_mut.values!=0, cmap="bone_r")
ax.set(yticks=range(len(yticks)),
       yticklabels=yticks,
       xticks=range(len(xticks)),
       xticklabels=xticks);
ax.set_xticklabels(xticks, rotation=90);
fig.savefig(os.path.join(EXPORT_PATH,"sequenced_mut_mutations.pdf"), bbox_inches='tight')

Filtering out the mutations from the ancestors¶

A table for all sequencing data¶