In [1]:
import pandas as pd
import numpy as np
import json
from itertools import chain
import os
EXPORT_PATH = '01_genealogies/'
if not os.path.exists(EXPORT_PATH):
    os.mkdir(EXPORT_PATH)
In [2]:
datasheet = pd.read_excel("raw/19-09-27/Counts_final.xlsx", sheet_name=None)
for k,v in datasheet.items():
    ex = k.replace('.csv','').split('-')
    v['experiment'] = ex[1] + '-' + ex[0].replace("WT","LCS-").replace("17","LCS+")
    v.columns = [str(x).lower().replace(' ', '_') for x in v.columns]
    v['tube'] = v.rack + v.lineage.map(str)
    v['time'] = 2*v.cycle + v.phase
    print("raw/Counts_final.xlsx",k, v['experiment'].count())

data = pd.concat(datasheet.values()) 
data = data.replace('na', np.nan)
data.reset_index(inplace=True, drop=True)

# We start the numbering of cycles at 1 
data.cycle += 1

# Microcosm names
data['microcosm'] = [f"{row.cycle}-{['I','II'][row.phase-1]}-{row.tube}" for i,row in data.iterrows()]

# Give an unique identifier to all tubes:
data['name'] = [f"{row.experiment}_{row.microcosm}" for i,row in data.iterrows()]

# Ordering colums 
data = data.set_index(['experiment','tube','time']).reset_index()
/home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed
  warn(msg)
/home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed
  warn(msg)
/home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed
  warn(msg)
/home/guilhem/research/2019_colgen/mat/lce-genealogies/venv/lib/python3.12/site-packages/openpyxl/worksheet/_reader.py:329: UserWarning: Unknown extension is not supported and will be removed
  warn(msg)
/tmp/ipykernel_347938/1158667774.py:11: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data = data.replace('na', np.nan)
raw/Counts_final.xlsx WT-S 960
raw/Counts_final.xlsx WT-L.csv 960
raw/Counts_final.xlsx 17-S 960
raw/Counts_final.xlsx 17-L.csv 960
In [3]:
# Let us drop the miss for now... 
miss = data.condition!='MISS' 
print('Found {} MISS'.format(miss.shape[0] - miss.sum()))
print(data[np.logical_not(miss)].loc[:,['experiment','tube','cycle','rack','phase']].groupby(['experiment','cycle','rack','phase']).count().rename(columns={'tube':'count'}))

miss_quality_str = 'Dropping entries with condition "MISS"\n'
miss_quality_str += str(data[np.logical_not(miss)][['name','experiment','tube','cycle','rack','phase',"condition"]].set_index('name'))
miss_quality_str += ('\n droped {} MISS'.format(miss.shape[0] - miss.sum()))

data = data[miss]
print('{} Entries -- droped {} MISS'.format(data.shape[0], miss.shape[0] - miss.sum()))
Found 32 MISS
                             count
experiment cycle rack phase       
L-LCS-     1     F    2          8
           2     F    1          8
S-LCS-     5     a    1          8
                      2          8
3808 Entries -- droped 32 MISS
In [4]:
## Find the parent of all tubes. 
if 'parent' in data.columns:
    data.drop('parent',1, inplace=True) 
    data.drop('name_parent',1, inplace=True) 

    
# Tube that were replaced: the parent is the one with replaced==tube at the previous generation
parent = data.loc[:,['experiment','time','tube','replaced']]
parent['parent'] = [row.replaced if row.replaced!=0 else row.tube for _,row in parent.iterrows()] 
parent.drop('replaced',axis=1, inplace=True) 
parent = pd.merge(parent,
                  data[['experiment','time','tube','name']].rename(columns={'tube':'parent'}),
                  on=['experiment','time','parent'],
                  how='right').rename(columns={'name':'name_parent'})
parent.drop('parent',axis=1, inplace=True) 
parent.rename(columns={'name_parent':'parent'},inplace=True)
parent.time += 1

df = pd.merge(data,
              parent,
              right_on=["experiment","time","tube"],
              left_on=["experiment","time","tube"], 
              how='left')
df.parent.isnull()

df.loc[df.time==1, 'parent'] = [exp+'_ROOT' for exp in df.loc[df.time==1, 'experiment']]

# Tube that were not replaced have their parent in the same tube.
df.fillna({'parent':df.tube},inplace=True)
assert data.shape[0] == df.shape[0]
data = df
df.tail()
Out[4]:
experiment tube time rack lineage cycle phase mat count_ws count_sm count_fz extinct replaced condition microcosm name parent
3803 L-LCS+ F4 20 F 4 10 2 0.0 10.0 84.0 0.0 0.0 0 0 10-II-F4 L-LCS+_10-II-F4 L-LCS+_10-I-F4
3804 L-LCS+ F5 20 F 5 10 2 0.0 1.0 88.0 0.0 0.0 0 0 10-II-F5 L-LCS+_10-II-F5 L-LCS+_10-I-F5
3805 L-LCS+ F6 20 F 6 10 2 1.0 4.0 176.0 0.0 0.0 0 0 10-II-F6 L-LCS+_10-II-F6 L-LCS+_10-I-F6
3806 L-LCS+ F7 20 F 7 10 2 0.0 1.0 60.0 0.0 0.0 0 0 10-II-F7 L-LCS+_10-II-F7 L-LCS+_10-I-F7
3807 L-LCS+ F8 20 F 8 10 2 0.0 3.0 56.0 0.0 0.0 0 0 10-II-F8 L-LCS+_10-II-F8 L-LCS+_10-I-F5
In [5]:
# Get the grand parent of a node (i.e the first ancestor with the same phase). 
d = df.set_index('name')
parent_cycle = []
for k in data.name:
    p = d.loc[k].parent
    if p in d.index:
        pp = d.loc[p].parent
    elif 'ROOT' in p:
        pp = p
    else:
        print('No grand-parent found for {} (Parent was {})'.format(k, p))
    parent_cycle.append(pp)
data['parent_within_cycle'] = data['parent']
data['parent_previous_cycle'] = parent_cycle
No grand-parent found for S-LCS-_6-I-a1 (Parent was a1)
No grand-parent found for S-LCS-_6-I-a2 (Parent was a2)
No grand-parent found for S-LCS-_6-I-a3 (Parent was a3)
No grand-parent found for S-LCS-_6-I-a4 (Parent was a4)
No grand-parent found for S-LCS-_6-I-a5 (Parent was a5)
No grand-parent found for S-LCS-_6-I-a6 (Parent was a6)
No grand-parent found for S-LCS-_6-I-a7 (Parent was a7)
No grand-parent found for S-LCS-_6-I-a8 (Parent was a8)
No grand-parent found for L-LCS-_3-I-F1 (Parent was F1)
No grand-parent found for L-LCS-_3-I-F2 (Parent was F2)
No grand-parent found for L-LCS-_3-I-F3 (Parent was F3)
No grand-parent found for L-LCS-_3-I-F4 (Parent was F4)
No grand-parent found for L-LCS-_3-I-F5 (Parent was F5)
No grand-parent found for L-LCS-_3-I-F6 (Parent was F6)
No grand-parent found for L-LCS-_3-I-F7 (Parent was F7)
No grand-parent found for L-LCS-_3-I-F8 (Parent was F8)
In [6]:
# Convert ws and sm counts
data.loc[data.count_ws=='not counted','condition'] = 'WS not counted'
data.loc[data.count_ws=='not countable','condition'] = [x+', WS not countable' if x != '0'
                                                        else 'WS not countable' 
                                                        for x in data.loc[data.count_ws=='not countable','condition']]
data.loc[data.count_ws=='WS not selectable','condition'] = [x+', WS not selectable' if x != '0'
                                                            else 'WS not selectable'
                                                            for x in data.loc[data.count_ws=='WS not selectable','condition']]
data.loc[data.count_ws=='not counted','count_ws'] = np.nan
data.loc[data.count_ws=='not countable','count_ws'] = np.nan
data.loc[data.count_ws=='WS not selectable','count_ws'] = np.nan
data.count_ws = pd.Series(data.count_ws, dtype=float)

data.loc[data.count_sm=='not counted','condition'] = [x+', SM not counted' if x != '0'
                                                      else 'SM not counted' 
                                                      for x in data.loc[data.count_sm=='not counted','condition']]
data.loc[data.count_sm=='not countable','condition'] = [x+', SM not countable' if x != '0'
                                                        else 'SM not countable' 
                                                        for x in data.loc[data.count_sm=='not countable','condition']]
data.loc[data.count_sm=='SM not selectable','condition'] = [x+', SM not selectable' if x != '0' else 'SM not selectable' for x in data.loc[data.count_sm=='SM not selectable','condition']]
data.loc[data.count_sm=='not counted','count_sm'] = np.nan
data.loc[data.count_sm=='not countable','count_sm'] = np.nan
data.loc[data.count_sm=='SM not selectable','count_sm'] = np.nan
data.count_sm = pd.Series(data.count_sm, dtype=float)
In [7]:
## Quality control !
errors = {
    'Phase 1 extinct but Mat = 1 and SM>0':
    'extinct==1 and phase == 1 and mat==1 and count_sm>0',
    'Extinct 1 but replaced = 0': 
    'extinct==1 and replaced == 0',
    'Phase 2 extinct but WS>0':  
    'extinct==1 and phase == 2 and count_ws>0',
    'No Phase 2 extinct but no WS': 
    'extinct==0 and phase == 2 and count_ws==0',
    'No Phase 1 extinct but no SM': 
    'extinct==0 and phase == 1 and count_sm==0',
    'No Phase 1 extinct but no mat': 
    'extinct==0 and phase == 1 and mat==0',
    'Experimental troubles (Condition is not 0)':
    'condition!=0',
    'Condition NaN':
    'condition!=condition',
}

odd = ''
i = 0
idata = data.set_index(["experiment","tube","time"])
for k,v in data.query('replaced!=0').loc[:,["experiment","tube","time","replaced",'name']].iterrows():
    key = tuple((v[0],v[3],v[2]))
    try:
        if idata.loc[key,'extinct']:         
                i+=1
                odd += "\n"+' '.join(('[',str(v[0]),
                              'Cycle:',str(idata.loc[key,'cycle']),
                              'Phase:', str(idata.loc[key,'phase']), ']',
                              str(v[3]), 'replaced ', str(v[1]), 'even though it was extinct.'))
    except KeyError:
        if not np.isnan(v[3]):
            odd += '\n'.join(['','~~~','Replaced by invalid tube: ', str(v),'~~~'])
odd += '\n'
quality = ''
quality += '-'*50+'\n'
quality += 'GENEALOGY DATASET IS {} ENTRIES \n'.format(data.shape[0])
quality += str(data[['experiment','name']].groupby('experiment').count().rename(columns={'name':'entries'})) + '\n'
quality += '-'*50+'\n'
quality += miss_quality_str + '\n'
quality += '-'*50+'\n'
quality += '\n\n Oddities \n' 
quality += '===== In {} case(s), {} =====\n'.format(i,"Replaced by an extinct tube")
quality += odd 
quality += '-'*50+'\n\n'

for k,v in errors.items():
    try:
        dd = data.query(v)
    except Exception as ex:
        print(ex)
        print(v)
        
    quality += '-'*50+'\n'
    quality += '===== In {} case(s), {} =====\n'.format(dd.shape[0],k)
    if 0<dd.shape[0]<20:
        quality += str(dd.loc[:,['name','experiment','tube','cycle','condition','phase']].set_index(['experiment','tube','cycle','phase']))
        quality += '\n'
    elif v=='condition!=0':
        quality += str(dd.loc[:,['name','experiment','tube','condition']].groupby(['condition','experiment']).count().rename(columns={'tube':'count'}))
        quality += '\n'
    elif v=='condition!=condition':
        quality += str(dd.loc[:,['name','experiment','tube','cycle','rack','phase']].groupby(['experiment','cycle','rack','phase']).count().rename(columns={'tube':'count'}))
        quality += '\n'
    quality += '(Query : {} )\n'.format(v)
    quality += '-'*50+'\n'
    quality += '\n'

print(quality)
with open(os.path.join(EXPORT_PATH,'quality.txt'),'w') as file:
    file.write(quality)
--------------------------------------------------
GENEALOGY DATASET IS 3808 ENTRIES 
            entries
experiment         
L-LCS+          960
L-LCS-          944
S-LCS+          960
S-LCS-          944
--------------------------------------------------
Dropping entries with condition "MISS"
               experiment tube  cycle rack  phase condition
name                                                       
S-LCS-_5-I-a1      S-LCS-   a1      5    a      1      MISS
S-LCS-_5-I-a2      S-LCS-   a2      5    a      1      MISS
S-LCS-_5-I-a3      S-LCS-   a3      5    a      1      MISS
S-LCS-_5-I-a4      S-LCS-   a4      5    a      1      MISS
S-LCS-_5-I-a5      S-LCS-   a5      5    a      1      MISS
S-LCS-_5-I-a6      S-LCS-   a6      5    a      1      MISS
S-LCS-_5-I-a7      S-LCS-   a7      5    a      1      MISS
S-LCS-_5-I-a8      S-LCS-   a8      5    a      1      MISS
S-LCS-_5-II-a1     S-LCS-   a1      5    a      2      MISS
S-LCS-_5-II-a2     S-LCS-   a2      5    a      2      MISS
S-LCS-_5-II-a3     S-LCS-   a3      5    a      2      MISS
S-LCS-_5-II-a4     S-LCS-   a4      5    a      2      MISS
S-LCS-_5-II-a5     S-LCS-   a5      5    a      2      MISS
S-LCS-_5-II-a6     S-LCS-   a6      5    a      2      MISS
S-LCS-_5-II-a7     S-LCS-   a7      5    a      2      MISS
S-LCS-_5-II-a8     S-LCS-   a8      5    a      2      MISS
L-LCS-_2-I-F1      L-LCS-   F1      2    F      1      MISS
L-LCS-_2-I-F2      L-LCS-   F2      2    F      1      MISS
L-LCS-_2-I-F3      L-LCS-   F3      2    F      1      MISS
L-LCS-_2-I-F4      L-LCS-   F4      2    F      1      MISS
L-LCS-_2-I-F5      L-LCS-   F5      2    F      1      MISS
L-LCS-_2-I-F6      L-LCS-   F6      2    F      1      MISS
L-LCS-_2-I-F7      L-LCS-   F7      2    F      1      MISS
L-LCS-_2-I-F8      L-LCS-   F8      2    F      1      MISS
L-LCS-_1-II-F1     L-LCS-   F1      1    F      2      MISS
L-LCS-_1-II-F2     L-LCS-   F2      1    F      2      MISS
L-LCS-_1-II-F3     L-LCS-   F3      1    F      2      MISS
L-LCS-_1-II-F4     L-LCS-   F4      1    F      2      MISS
L-LCS-_1-II-F5     L-LCS-   F5      1    F      2      MISS
L-LCS-_1-II-F6     L-LCS-   F6      1    F      2      MISS
L-LCS-_1-II-F7     L-LCS-   F7      1    F      2      MISS
L-LCS-_1-II-F8     L-LCS-   F8      1    F      2      MISS
 droped 32 MISS
--------------------------------------------------


 Oddities 
===== In 0 case(s), Replaced by an extinct tube =====

--------------------------------------------------

--------------------------------------------------
===== In 2 case(s), Phase 1 extinct but Mat = 1 and SM>0 =====
                                      name condition
experiment tube cycle phase                         
L-LCS-     D6   8     1      L-LCS-_8-I-D6       CON
L-LCS+     B1   7     1      L-LCS+_7-I-B1       CON
(Query : extinct==1 and phase == 1 and mat==1 and count_sm>0 )
--------------------------------------------------

--------------------------------------------------
===== In 0 case(s), Extinct 1 but replaced = 0 =====
(Query : extinct==1 and replaced == 0 )
--------------------------------------------------

--------------------------------------------------
===== In 0 case(s), Phase 2 extinct but WS>0 =====
(Query : extinct==1 and phase == 2 and count_ws>0 )
--------------------------------------------------

--------------------------------------------------
===== In 0 case(s), No Phase 2 extinct but no WS =====
(Query : extinct==0 and phase == 2 and count_ws==0 )
--------------------------------------------------

--------------------------------------------------
===== In 0 case(s), No Phase 1 extinct but no SM =====
(Query : extinct==0 and phase == 1 and count_sm==0 )
--------------------------------------------------

--------------------------------------------------
===== In 0 case(s), No Phase 1 extinct but no mat =====
(Query : extinct==0 and phase == 1 and mat==0 )
--------------------------------------------------

--------------------------------------------------
===== In 87 case(s), Experimental troubles (Condition is not 0) =====
                      name  count
condition experiment             
CON       L-LCS+         6      6
          L-LCS-        13     13
          S-LCS+        10     10
          S-LCS-        11     11
ERR       L-LCS+         1      1
          L-LCS-         5      5
GLY       L-LCS+         8      8
          L-LCS-         8      8
          S-LCS+         8      8
          S-LCS-        16     16
SLUG      L-LCS-         1      1
(Query : condition!=0 )
--------------------------------------------------

--------------------------------------------------
===== In 0 case(s), Condition NaN =====
Empty DataFrame
Columns: [name, count]
Index: []
(Query : condition!=condition )
--------------------------------------------------


/tmp/ipykernel_347938/1509597062.py:25: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  key = tuple((v[0],v[3],v[2]))
In [8]:
def export(data, name):
    data.set_index('name').to_csv(EXPORT_PATH+name+'.csv')
    for phase in (1,2):
        dd = data[data.phase==phase].copy()
        dd['parent'] = dd['parent_previous_cycle']
        dd.set_index('name').to_csv(EXPORT_PATH+'{}_p{}.csv'.format(name, phase))
for exp,df in data.groupby('experiment'):
    if exp == "L-LCS-":
        df = df.query('rack!="F"|time!=5')
    if exp == "S-LCS-":
        df = df.query('rack!="a"|time!=11')
    export(df, exp)
In [9]:
import sqlite3
with sqlite3.connect("lce_data.sqlite") as database:
    data.to_sql("genealogies", database, if_exists="replace", index=False,
               dtype={'name':"TEXT PRIMARY KEY"})
In [ ]: