In [1]:
import pandas as pd
import numpy as np
import json
from itertools import chain
import os
EXPORT_PATH = '01_genealogies/'
if not os.path.exists(EXPORT_PATH):
os.mkdir(EXPORT_PATH)
In [2]:
datasheet = pd.read_excel("raw/19-09-27/Counts_final.xlsx", sheet_name=None)
for k,v in datasheet.items():
ex = k.replace('.csv','').split('-')
v['experiment'] = ex[1] + '-' + ex[0].replace("WT","LCS-").replace("17","LCS+")
v.columns = [str(x).lower().replace(' ', '_') for x in v.columns]
v['tube'] = v.rack + v.lineage.map(str)
v['time'] = 2*v.cycle + v.phase
print("raw/Counts_final.xlsx",k, v['experiment'].count())
data = pd.concat(datasheet.values())
data = data.replace('na', np.nan)
data.reset_index(inplace=True, drop=True)
# We start the numbering of cycles at 1
data.cycle += 1
# Microcosm names
data['microcosm'] = [f"{row.cycle}-{['I','II'][row.phase-1]}-{row.tube}" for i,row in data.iterrows()]
# Give an unique identifier to all tubes:
data['name'] = [f"{row.experiment}_{row.microcosm}" for i,row in data.iterrows()]
# Ordering colums
data = data.set_index(['experiment','tube','time']).reset_index()
/home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed warn(msg) /home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed warn(msg) /home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed warn(msg)
/home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed warn(msg) /tmp/ipykernel_347938/1158667774.py:11: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` data = data.replace('na', np.nan)
raw/Counts_final.xlsx WT-S 960 raw/Counts_final.xlsx WT-L.csv 960 raw/Counts_final.xlsx 17-S 960 raw/Counts_final.xlsx 17-L.csv 960
In [3]:
# Let us drop the miss for now...
miss = data.condition!='MISS'
print('Found {} MISS'.format(miss.shape[0] - miss.sum()))
print(data[np.logical_not(miss)].loc[:,['experiment','tube','cycle','rack','phase']].groupby(['experiment','cycle','rack','phase']).count().rename(columns={'tube':'count'}))
miss_quality_str = 'Dropping entries with condition "MISS"\n'
miss_quality_str += str(data[np.logical_not(miss)][['name','experiment','tube','cycle','rack','phase',"condition"]].set_index('name'))
miss_quality_str += ('\n droped {} MISS'.format(miss.shape[0] - miss.sum()))
data = data[miss]
print('{} Entries -- droped {} MISS'.format(data.shape[0], miss.shape[0] - miss.sum()))
Found 32 MISS count experiment cycle rack phase L-LCS- 1 F 2 8 2 F 1 8 S-LCS- 5 a 1 8 2 8 3808 Entries -- droped 32 MISS
In [4]:
## Find the parent of all tubes.
if 'parent' in data.columns:
data.drop('parent',1, inplace=True)
data.drop('name_parent',1, inplace=True)
# Tube that were replaced: the parent is the one with replaced==tube at the previous generation
parent = data.loc[:,['experiment','time','tube','replaced']]
parent['parent'] = [row.replaced if row.replaced!=0 else row.tube for _,row in parent.iterrows()]
parent.drop('replaced',axis=1, inplace=True)
parent = pd.merge(parent,
data[['experiment','time','tube','name']].rename(columns={'tube':'parent'}),
on=['experiment','time','parent'],
how='right').rename(columns={'name':'name_parent'})
parent.drop('parent',axis=1, inplace=True)
parent.rename(columns={'name_parent':'parent'},inplace=True)
parent.time += 1
df = pd.merge(data,
parent,
right_on=["experiment","time","tube"],
left_on=["experiment","time","tube"],
how='left')
df.parent.isnull()
df.loc[df.time==1, 'parent'] = [exp+'_ROOT' for exp in df.loc[df.time==1, 'experiment']]
# Tube that were not replaced have their parent in the same tube.
df.fillna({'parent':df.tube},inplace=True)
assert data.shape[0] == df.shape[0]
data = df
df.tail()
Out[4]:
experiment | tube | time | rack | lineage | cycle | phase | mat | count_ws | count_sm | count_fz | extinct | replaced | condition | microcosm | name | parent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3803 | L-LCS+ | F4 | 20 | F | 4 | 10 | 2 | 0.0 | 10.0 | 84.0 | 0.0 | 0.0 | 0 | 0 | 10-II-F4 | L-LCS+_10-II-F4 | L-LCS+_10-I-F4 |
3804 | L-LCS+ | F5 | 20 | F | 5 | 10 | 2 | 0.0 | 1.0 | 88.0 | 0.0 | 0.0 | 0 | 0 | 10-II-F5 | L-LCS+_10-II-F5 | L-LCS+_10-I-F5 |
3805 | L-LCS+ | F6 | 20 | F | 6 | 10 | 2 | 1.0 | 4.0 | 176.0 | 0.0 | 0.0 | 0 | 0 | 10-II-F6 | L-LCS+_10-II-F6 | L-LCS+_10-I-F6 |
3806 | L-LCS+ | F7 | 20 | F | 7 | 10 | 2 | 0.0 | 1.0 | 60.0 | 0.0 | 0.0 | 0 | 0 | 10-II-F7 | L-LCS+_10-II-F7 | L-LCS+_10-I-F7 |
3807 | L-LCS+ | F8 | 20 | F | 8 | 10 | 2 | 0.0 | 3.0 | 56.0 | 0.0 | 0.0 | 0 | 0 | 10-II-F8 | L-LCS+_10-II-F8 | L-LCS+_10-I-F5 |
In [5]:
# Get the grand parent of a node (i.e the first ancestor with the same phase).
d = df.set_index('name')
parent_cycle = []
for k in data.name:
p = d.loc[k].parent
if p in d.index:
pp = d.loc[p].parent
elif 'ROOT' in p:
pp = p
else:
print('No grand-parent found for {} (Parent was {})'.format(k, p))
parent_cycle.append(pp)
data['parent_within_cycle'] = data['parent']
data['parent_previous_cycle'] = parent_cycle
No grand-parent found for S-LCS-_6-I-a1 (Parent was a1) No grand-parent found for S-LCS-_6-I-a2 (Parent was a2) No grand-parent found for S-LCS-_6-I-a3 (Parent was a3) No grand-parent found for S-LCS-_6-I-a4 (Parent was a4) No grand-parent found for S-LCS-_6-I-a5 (Parent was a5) No grand-parent found for S-LCS-_6-I-a6 (Parent was a6) No grand-parent found for S-LCS-_6-I-a7 (Parent was a7) No grand-parent found for S-LCS-_6-I-a8 (Parent was a8) No grand-parent found for L-LCS-_3-I-F1 (Parent was F1) No grand-parent found for L-LCS-_3-I-F2 (Parent was F2) No grand-parent found for L-LCS-_3-I-F3 (Parent was F3) No grand-parent found for L-LCS-_3-I-F4 (Parent was F4) No grand-parent found for L-LCS-_3-I-F5 (Parent was F5) No grand-parent found for L-LCS-_3-I-F6 (Parent was F6) No grand-parent found for L-LCS-_3-I-F7 (Parent was F7) No grand-parent found for L-LCS-_3-I-F8 (Parent was F8)
In [6]:
# Convert ws and sm counts
data.loc[data.count_ws=='not counted','condition'] = 'WS not counted'
data.loc[data.count_ws=='not countable','condition'] = [x+', WS not countable' if x != '0'
else 'WS not countable'
for x in data.loc[data.count_ws=='not countable','condition']]
data.loc[data.count_ws=='WS not selectable','condition'] = [x+', WS not selectable' if x != '0'
else 'WS not selectable'
for x in data.loc[data.count_ws=='WS not selectable','condition']]
data.loc[data.count_ws=='not counted','count_ws'] = np.nan
data.loc[data.count_ws=='not countable','count_ws'] = np.nan
data.loc[data.count_ws=='WS not selectable','count_ws'] = np.nan
data.count_ws = pd.Series(data.count_ws, dtype=float)
data.loc[data.count_sm=='not counted','condition'] = [x+', SM not counted' if x != '0'
else 'SM not counted'
for x in data.loc[data.count_sm=='not counted','condition']]
data.loc[data.count_sm=='not countable','condition'] = [x+', SM not countable' if x != '0'
else 'SM not countable'
for x in data.loc[data.count_sm=='not countable','condition']]
data.loc[data.count_sm=='SM not selectable','condition'] = [x+', SM not selectable' if x != '0' else 'SM not selectable' for x in data.loc[data.count_sm=='SM not selectable','condition']]
data.loc[data.count_sm=='not counted','count_sm'] = np.nan
data.loc[data.count_sm=='not countable','count_sm'] = np.nan
data.loc[data.count_sm=='SM not selectable','count_sm'] = np.nan
data.count_sm = pd.Series(data.count_sm, dtype=float)
In [7]:
## Quality control !
errors = {
'Phase 1 extinct but Mat = 1 and SM>0':
'extinct==1 and phase == 1 and mat==1 and count_sm>0',
'Extinct 1 but replaced = 0':
'extinct==1 and replaced == 0',
'Phase 2 extinct but WS>0':
'extinct==1 and phase == 2 and count_ws>0',
'No Phase 2 extinct but no WS':
'extinct==0 and phase == 2 and count_ws==0',
'No Phase 1 extinct but no SM':
'extinct==0 and phase == 1 and count_sm==0',
'No Phase 1 extinct but no mat':
'extinct==0 and phase == 1 and mat==0',
'Experimental troubles (Condition is not 0)':
'condition!=0',
'Condition NaN':
'condition!=condition',
}
odd = ''
i = 0
idata = data.set_index(["experiment","tube","time"])
for k,v in data.query('replaced!=0').loc[:,["experiment","tube","time","replaced",'name']].iterrows():
key = tuple((v[0],v[3],v[2]))
try:
if idata.loc[key,'extinct']:
i+=1
odd += "\n"+' '.join(('[',str(v[0]),
'Cycle:',str(idata.loc[key,'cycle']),
'Phase:', str(idata.loc[key,'phase']), ']',
str(v[3]), 'replaced ', str(v[1]), 'even though it was extinct.'))
except KeyError:
if not np.isnan(v[3]):
odd += '\n'.join(['','~~~','Replaced by invalid tube: ', str(v),'~~~'])
odd += '\n'
quality = ''
quality += '-'*50+'\n'
quality += 'GENEALOGY DATASET IS {} ENTRIES \n'.format(data.shape[0])
quality += str(data[['experiment','name']].groupby('experiment').count().rename(columns={'name':'entries'})) + '\n'
quality += '-'*50+'\n'
quality += miss_quality_str + '\n'
quality += '-'*50+'\n'
quality += '\n\n Oddities \n'
quality += '===== In {} case(s), {} =====\n'.format(i,"Replaced by an extinct tube")
quality += odd
quality += '-'*50+'\n\n'
for k,v in errors.items():
try:
dd = data.query(v)
except Exception as ex:
print(ex)
print(v)
quality += '-'*50+'\n'
quality += '===== In {} case(s), {} =====\n'.format(dd.shape[0],k)
if 0<dd.shape[0]<20:
quality += str(dd.loc[:,['name','experiment','tube','cycle','condition','phase']].set_index(['experiment','tube','cycle','phase']))
quality += '\n'
elif v=='condition!=0':
quality += str(dd.loc[:,['name','experiment','tube','condition']].groupby(['condition','experiment']).count().rename(columns={'tube':'count'}))
quality += '\n'
elif v=='condition!=condition':
quality += str(dd.loc[:,['name','experiment','tube','cycle','rack','phase']].groupby(['experiment','cycle','rack','phase']).count().rename(columns={'tube':'count'}))
quality += '\n'
quality += '(Query : {} )\n'.format(v)
quality += '-'*50+'\n'
quality += '\n'
print(quality)
with open(os.path.join(EXPORT_PATH,'quality.txt'),'w') as file:
file.write(quality)
-------------------------------------------------- GENEALOGY DATASET IS 3808 ENTRIES entries experiment L-LCS+ 960 L-LCS- 944 S-LCS+ 960 S-LCS- 944 -------------------------------------------------- Dropping entries with condition "MISS" experiment tube cycle rack phase condition name S-LCS-_5-I-a1 S-LCS- a1 5 a 1 MISS S-LCS-_5-I-a2 S-LCS- a2 5 a 1 MISS S-LCS-_5-I-a3 S-LCS- a3 5 a 1 MISS S-LCS-_5-I-a4 S-LCS- a4 5 a 1 MISS S-LCS-_5-I-a5 S-LCS- a5 5 a 1 MISS S-LCS-_5-I-a6 S-LCS- a6 5 a 1 MISS S-LCS-_5-I-a7 S-LCS- a7 5 a 1 MISS S-LCS-_5-I-a8 S-LCS- a8 5 a 1 MISS S-LCS-_5-II-a1 S-LCS- a1 5 a 2 MISS S-LCS-_5-II-a2 S-LCS- a2 5 a 2 MISS S-LCS-_5-II-a3 S-LCS- a3 5 a 2 MISS S-LCS-_5-II-a4 S-LCS- a4 5 a 2 MISS S-LCS-_5-II-a5 S-LCS- a5 5 a 2 MISS S-LCS-_5-II-a6 S-LCS- a6 5 a 2 MISS S-LCS-_5-II-a7 S-LCS- a7 5 a 2 MISS S-LCS-_5-II-a8 S-LCS- a8 5 a 2 MISS L-LCS-_2-I-F1 L-LCS- F1 2 F 1 MISS L-LCS-_2-I-F2 L-LCS- F2 2 F 1 MISS L-LCS-_2-I-F3 L-LCS- F3 2 F 1 MISS L-LCS-_2-I-F4 L-LCS- F4 2 F 1 MISS L-LCS-_2-I-F5 L-LCS- F5 2 F 1 MISS L-LCS-_2-I-F6 L-LCS- F6 2 F 1 MISS L-LCS-_2-I-F7 L-LCS- F7 2 F 1 MISS L-LCS-_2-I-F8 L-LCS- F8 2 F 1 MISS L-LCS-_1-II-F1 L-LCS- F1 1 F 2 MISS L-LCS-_1-II-F2 L-LCS- F2 1 F 2 MISS L-LCS-_1-II-F3 L-LCS- F3 1 F 2 MISS L-LCS-_1-II-F4 L-LCS- F4 1 F 2 MISS L-LCS-_1-II-F5 L-LCS- F5 1 F 2 MISS L-LCS-_1-II-F6 L-LCS- F6 1 F 2 MISS L-LCS-_1-II-F7 L-LCS- F7 1 F 2 MISS L-LCS-_1-II-F8 L-LCS- F8 1 F 2 MISS droped 32 MISS -------------------------------------------------- Oddities ===== In 0 case(s), Replaced by an extinct tube ===== -------------------------------------------------- -------------------------------------------------- ===== In 2 case(s), Phase 1 extinct but Mat = 1 and SM>0 ===== name condition experiment tube cycle phase L-LCS- D6 8 1 L-LCS-_8-I-D6 CON L-LCS+ B1 7 1 L-LCS+_7-I-B1 CON (Query : extinct==1 and phase == 1 and mat==1 and count_sm>0 ) -------------------------------------------------- -------------------------------------------------- ===== In 0 case(s), Extinct 1 but replaced = 0 ===== (Query : extinct==1 and replaced == 0 ) -------------------------------------------------- -------------------------------------------------- ===== In 0 case(s), Phase 2 extinct but WS>0 ===== (Query : extinct==1 and phase == 2 and count_ws>0 ) -------------------------------------------------- -------------------------------------------------- ===== In 0 case(s), No Phase 2 extinct but no WS ===== (Query : extinct==0 and phase == 2 and count_ws==0 ) -------------------------------------------------- -------------------------------------------------- ===== In 0 case(s), No Phase 1 extinct but no SM ===== (Query : extinct==0 and phase == 1 and count_sm==0 ) -------------------------------------------------- -------------------------------------------------- ===== In 0 case(s), No Phase 1 extinct but no mat ===== (Query : extinct==0 and phase == 1 and mat==0 ) -------------------------------------------------- -------------------------------------------------- ===== In 87 case(s), Experimental troubles (Condition is not 0) ===== name count condition experiment CON L-LCS+ 6 6 L-LCS- 13 13 S-LCS+ 10 10 S-LCS- 11 11 ERR L-LCS+ 1 1 L-LCS- 5 5 GLY L-LCS+ 8 8 L-LCS- 8 8 S-LCS+ 8 8 S-LCS- 16 16 SLUG L-LCS- 1 1 (Query : condition!=0 ) -------------------------------------------------- -------------------------------------------------- ===== In 0 case(s), Condition NaN ===== Empty DataFrame Columns: [name, count] Index: [] (Query : condition!=condition ) --------------------------------------------------
/tmp/ipykernel_347938/1509597062.py:25: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` key = tuple((v[0],v[3],v[2]))
In [8]:
def export(data, name):
data.set_index('name').to_csv(EXPORT_PATH+name+'.csv')
for phase in (1,2):
dd = data[data.phase==phase].copy()
dd['parent'] = dd['parent_previous_cycle']
dd.set_index('name').to_csv(EXPORT_PATH+'{}_p{}.csv'.format(name, phase))
for exp,df in data.groupby('experiment'):
if exp == "L-LCS-":
df = df.query('rack!="F"|time!=5')
if exp == "S-LCS-":
df = df.query('rack!="a"|time!=11')
export(df, exp)
In [9]:
import sqlite3
with sqlite3.connect("lce_data.sqlite") as database:
data.to_sql("genealogies", database, if_exists="replace", index=False,
dtype={'name':"TEXT PRIMARY KEY"})
In [ ]: