# dag9_tenx_single_cell_immune_profiling.py
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_singlecell_notebook_wrapper_func
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_analysis_files_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import task_branch_function
  19. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import upload_analysis_file_to_box
  20. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import convert_bam_to_cram_func
  21. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_picard_for_cellranger
  22. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_samtools_for_cellranger
  23. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_multiqc_for_cellranger
  24. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import index_and_copy_bam_for_parallel_analysis
  25. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import change_pipeline_status
  26. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import clean_up_files
  27. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import create_and_update_qc_pages
  28. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_metrices_to_collection
  29. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import generate_cell_sorted_bam_func
  30. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_velocyto_func
  31. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scvelo_for_sc_5p_func
  32. ## ARGS
  33. default_args = {
  34. 'owner': 'airflow',
  35. 'depends_on_past': False,
  36. 'start_date': days_ago(2),
  37. 'email_on_failure': False,
  38. 'email_on_retry': False,
  39. 'retries': 4,
  40. 'max_active_runs':10,
  41. 'catchup':True,
  42. 'retry_delay': timedelta(minutes=5),
  43. 'provide_context': True,
  44. }
  45. FEATURE_TYPE_LIST = \
  46. Variable.get('tenx_single_cell_immune_profiling_feature_types',default_var={})#.split(',')
  47. ## DAG
  48. dag = \
  49. DAG(
  50. dag_id='dag9_tenx_single_cell_immune_profiling',
  51. schedule_interval=None,
  52. tags=['hpc','analysis','tenx','sc'],
  53. default_args=default_args,
  54. concurrency=100,
  55. max_active_runs=20,
  56. orientation='LR')
  57. with dag:
  ## TASK
  # First task of the DAG: a branch operator driven by
  # fetch_analysis_info_and_branch_func. Its downstream tasks are the
  # per-feature-type branch tasks and 'no_analysis'.
  fetch_analysis_info_and_branch = BranchPythonOperator(
    task_id='fetch_analysis_info',
    dag=dag,
    queue='hpc_4G',
    python_callable=fetch_analysis_info_and_branch_func,
    params={
      'no_analysis_task': 'no_analysis',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
    })
  ## TASK
  # Runs configure_cellranger_run_func once the trimmed-file collector tasks
  # are done. It sits behind branch operators, hence the non-default
  # 'none_failed_or_skipped' trigger rule.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    dag=dag,
    queue='hpc_4G',
    trigger_rule='none_failed_or_skipped',
    python_callable=configure_cellranger_run_func,
    params={
      'xcom_pull_task_id': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv',
    })
  # One sub-graph per configured feature type:
  #   fetch_analysis_info >> <feature type branch> >> run_trim_* (x10)
  #     >> collect_trimmed_files_<feature type> >> configure_cellranger_run
  for analysis_name in FEATURE_TYPE_LIST.keys():
    ## TASK
    # Branch task named after the feature type; task_branch_function gets
    # the 'run_trim' prefix so it can refer to the trim tasks created below.
    task_branch = BranchPythonOperator(
      task_id=analysis_name,
      dag=dag,
      queue='hpc_4G',
      python_callable=task_branch_function,
      params={
        'xcom_pull_task_id': 'fetch_analysis_info',
        'analysis_info_xcom_key': 'analysis_info',
        'analysis_name': analysis_name,
        'task_prefix': 'run_trim',
      })
    ## TASK
    # Fixed pool of ten trimming tasks (run_id 0..9) for this feature type.
    run_trim_list = [
      PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
        dag=dag,
        queue='hpc_4G',
        python_callable=run_sc_read_trimmming_func,
        params={
          'xcom_pull_task_id': 'fetch_analysis_info',
          'analysis_info_xcom_key': 'analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'analysis_name': analysis_name,
          'run_id': run_id,
          'r1-length': 0,
          'r2-length': 0,
          'fastq_input_dir_tag': 'fastq_dir',
          'use_ephemeral_space': True,
          'fastq_output_dir_tag': 'output_path',
        })
      for run_id in range(0, 10)]
    ## TASK
    # Join point for the trim tasks of this feature type.
    collect_trimmed_files = DummyOperator(
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
      dag=dag,
      trigger_rule='none_failed_or_skipped')
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch
    task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files
    collect_trimmed_files >> configure_cellranger_run
  ## TASK
  # Placeholder branch target; its task id is the value passed to
  # fetch_analysis_info as 'no_analysis_task'.
  no_analysis = DummyOperator(task_id='no_analysis', dag=dag)
  ## PIPELINE
  fetch_analysis_info_and_branch >> no_analysis
  ## TASK
  # Calls run_cellranger_tool on the large 'hpc_64G16t24hr' queue; the
  # cellranger options below request 16 local cores and 64 of local memory.
  run_cellranger = PythonOperator(
    task_id='run_cellranger',
    dag=dag,
    queue='hpc_64G16t24hr',
    python_callable=run_cellranger_tool,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 16', '--localmem 64'],
    })
  ## PIPELINE
  configure_cellranger_run >> run_cellranger
  ## TASK
  # Branch point after cellranger. The params map logical names to the task
  # ids of the candidate downstream analyses for
  # decide_analysis_branch_func.
  # NOTE(review): 'run_scirpy_for_vdj' has no matching task in the part of
  # the file reviewed here, and the key 'run_scirpy_vdj_t_task' is spelled
  # differently from its '..._for_vdj_b_task' sibling - confirm against
  # decide_analysis_branch_func before renaming anything.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    dag=dag,
    queue='hpc_4G',
    python_callable=decide_analysis_branch_func,
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'convert_cellranger_bam_to_cram_task': 'convert_cellranger_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
    })
  ## PIPELINE
  run_cellranger >> decide_analysis_branch
  ## TASK
  # Registers the cellranger output via load_cellranger_result_to_db_func
  # and publishes the loaded file list and the html report under the
  # 'loaded_output_files' / 'html_report_file' xcom keys.
  load_cellranger_result_to_db = PythonOperator(
    task_id='load_cellranger_result_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_cellranger_result_to_db_func,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'html_collection_type': 'CELLRANGER_HTML',
      'collection_table': 'sample',
      'xcom_collection_name_key': 'sample_igf_id',
      'genome_column': 'genome_build',
      'analysis_name': 'cellranger_multi',
      'output_xcom_key': 'loaded_output_files',
      'html_xcom_key': 'html_report_file',
      'html_report_file_name': 'web_summary.html',
    })
  # The three uploads below all read their inputs from the xcom of
  # load_cellranger_result_to_db.
  upload_cellranger_report_to_ftp = PythonOperator(
    task_id='upload_cellranger_report_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLRANGER_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_cellranger_report_to_box = PythonOperator(
    task_id='upload_cellranger_report_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'analysis_tag': 'cellranger_multi',
    })
  upload_cellranger_results_to_irods = PythonOperator(
    task_id='upload_cellranger_results_to_irods',
    dag=dag,
    queue='hpc_4G',
    python_callable=irods_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'loaded_output_files',
      'collection_name_key': 'sample_igf_id',
      'collection_name_task': 'load_cellranger_result_to_db',
      'analysis_name': 'cellranger_multi',
    })
  ## PIPELINE
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> [
    upload_cellranger_report_to_ftp,
    upload_cellranger_report_to_box,
    upload_cellranger_results_to_irods]
  ## TASK
  # Scanpy branch: run the notebook wrapper with analysis_name 'scanpy',
  # then load / upload its outputs. The notebook task publishes the
  # 'scanpy_notebook', 'cellbrowser_dirs' and 'scanpy_h5ad' xcom keys.
  run_scanpy_for_sc_5p = PythonOperator(
    task_id='run_scanpy_for_sc_5p',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'count_dir': 'count',
      'analysis_name': 'scanpy',
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'output_scanpy_h5ad_key': 'scanpy_h5ad',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Attaches the values of metrics_summary.csv to the CELLRANGER_MULTI
  # collection with the CELLRANGER_COUNT attribute prefix.
  load_cellranger_gex_matrics_to_db = PythonOperator(
    task_id='load_cellranger_gex_matrics_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_cellranger_metrices_to_collection,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_COUNT',
    })
  load_scanpy_report_for_sc_5p_to_db = PythonOperator(
    task_id='load_scanpy_report_for_sc_5p_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scanpy_for_sc_5p',
      'file_name_key': 'scanpy_notebook',
      'analysis_name': 'scanpy_5p',
      'collection_type': 'SCANPY_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scanpy_report_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCANPY_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_scanpy_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scanpy_single_sample_report',
    })
  # Cellbrowser directories come straight from the notebook task's xcom,
  # not from the db-load task.
  upload_cellbrowser_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'run_scanpy_for_sc_5p',
      'xcom_pull_files_key': 'cellbrowser_dirs',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLBROWSER',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    load_scanpy_report_for_sc_5p_to_db,
    load_cellranger_gex_matrics_to_db,
    upload_cellbrowser_for_sc_5p_to_ftp]
  load_scanpy_report_for_sc_5p_to_db >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box]
  ## TASK
  # Scirpy branch for the 'vdj_b' directory: run the notebook wrapper with
  # analysis_name 'scirpy', then load its report and the cellranger metrics.
  run_scirpy_for_vdj_b = PythonOperator(
    task_id='run_scirpy_for_vdj_b',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_b',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Same metrics loader as the other branches, with the CELLRANGER_VDJB
  # attribute prefix.
  load_cellranger_vdjB_matrics_to_db = PythonOperator(
    task_id='load_cellranger_vdjB_matrics_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_cellranger_metrices_to_collection,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_VDJB',
    })
  load_scirpy_report_for_vdj_b_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_b_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_b',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_b',
      'collection_type': 'SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scirpy_report_for_vdj_b_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  # Uploads the scirpy vdj_b report (files listed in the xcom of
  # load_scirpy_report_for_vdj_b_to_db) via upload_analysis_file_to_box.
  # The task id previously read 'upload_scanpy_report_for_vdj_b_to_box', a
  # copy-paste slip: this task handles the scirpy report, and the vdj_t
  # sibling is named 'upload_scirpy_report_for_vdj_t_to_box'. The id now
  # matches the variable name so the task is labelled correctly in the UI.
  upload_scirpy_report_for_vdj_b_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_b_single_sample_report',
    })
  ## PIPELINE
  # vdj_b wiring: branch -> notebook -> (report load, metrics load);
  # the report load then fans out to the ftp and box uploads.
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> [
    load_scirpy_report_for_vdj_b_to_db,
    load_cellranger_vdjB_matrics_to_db]
  load_scirpy_report_for_vdj_b_to_db >> [
    upload_scirpy_report_for_vdj_b_to_ftp,
    upload_scirpy_report_for_vdj_b_to_box]
  ## TASK
  # Scirpy branch for the 'vdj_t' directory; mirrors the vdj_b branch with
  # the VDJ_T collection types and the CELLRANGER_VDJT attribute prefix.
  run_scirpy_for_vdj_t = PythonOperator(
    task_id='run_scirpy_for_vdj_t',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_t',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  load_cellranger_vdjT_matrics_to_db = PythonOperator(
    task_id='load_cellranger_vdjT_matrics_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_cellranger_metrices_to_collection,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_VDJT',
    })
  load_scirpy_report_for_vdj_t_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_t_to_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_t',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_t',
      'collection_type': 'SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scirpy_report_for_vdj_t_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_scirpy_report_for_vdj_t_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_t_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> [
    load_scirpy_report_for_vdj_t_to_db,
    load_cellranger_vdjT_matrics_to_db]
  load_scirpy_report_for_vdj_t_to_db >> [
    upload_scirpy_report_for_vdj_t_to_ftp,
    upload_scirpy_report_for_vdj_t_to_box]
  ## TASK
  # Seurat branch: same notebook wrapper as the scanpy/scirpy branches but
  # executed with the 'ir' kernel and analysis_name 'seurat'.
  run_seurat_for_sc_5p = PythonOperator(
    task_id='run_seurat_for_sc_5p',
    dag=dag,
    queue='hpc_4G',
    python_callable=run_singlecell_notebook_wrapper_func,
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'ir',
      'analysis_name': 'seurat',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'seurat_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  load_seurat_report_for_sc_5p_db = PythonOperator(
    task_id='load_seurat_report_for_sc_5p_db',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_seurat_for_sc_5p',
      'file_name_key': 'seurat_notebook',
      'analysis_name': 'seurat_5p',
      'collection_type': 'SEURAT_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_seurat_report_for_sc_5p_ftp = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SEURAT_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_seurat_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'seurat_single_sample_report',
    })
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p
  run_seurat_for_sc_5p >> load_seurat_report_for_sc_5p_db
  load_seurat_report_for_sc_5p_db >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box]
  ## TASK
  # Runs convert_bam_to_cram_func on count/sample_alignments.bam from the
  # cellranger output with 4 threads; the resulting file list is published
  # under the 'cram_files' xcom key.
  convert_cellranger_bam_to_cram = PythonOperator(
    task_id='convert_cellranger_bam_to_cram',
    dag=dag,
    queue='hpc_4G4t',
    python_callable=convert_bam_to_cram_func,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'threads': 4,
      'cellranger_bam_path': 'count/sample_alignments.bam',
      'analysis_name': 'cellranger',
      'collection_type': 'ANALYSIS_CRAM',
      'collection_table': 'sample',
      'cram_files_xcom_key': 'cram_files',
    })
  ## PIPELINE
  decide_analysis_branch >> convert_cellranger_bam_to_cram
  ## TASK
  # Calls generate_cell_sorted_bam_func to produce
  # count/cellsorted_sample_alignments.bam from the cellranger bam
  # (7 threads, samtools_mem '2G').
  generate_cell_sorted_bam = PythonOperator(
    task_id='generate_cell_sorted_bam',
    dag=dag,
    queue='hpc_16G8t',
    python_callable=generate_cell_sorted_bam_func,
    params={
      'xcom_pull_task': 'run_cellranger',
      'xcom_pull_files_key': 'cellranger_output',
      'cellranger_bam_path': 'count/sample_alignments.bam',
      'cellsorted_bam_path': 'count/cellsorted_sample_alignments.bam',
      'samtools_mem': '2G',
      'threads': 7,
    })
  # Runs run_velocyto_func against the cell-sorted bam of the cellranger
  # output. dag=dag is passed explicitly like every other operator in this
  # file; it was the only task relying on implicit DAG assignment.
  run_velocyto = PythonOperator(
    task_id='run_velocyto',
    dag=dag,
    queue='hpc_16G_long',
    python_callable=run_velocyto_func,
    params={
      'xcom_pull_task': 'run_cellranger',
      'xcom_pull_files_key': 'cellranger_output',
      'cell_sorted_bam_name': 'count/cellsorted_sample_alignments.bam',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
    })
  # Runs run_scvelo_for_sc_5p_func; it is pointed at the 'loom_output' xcom
  # of run_velocyto and the 'scanpy_h5ad' xcom of run_scanpy_for_sc_5p, and
  # publishes its notebook under 'scvelo_notebook'.
  run_scvelo_for_sc_5p = PythonOperator(
    task_id='run_scvelo_for_sc_5p',
    dag=dag,
    queue='hpc_32G16t',
    python_callable=run_scvelo_for_sc_5p_func,
    params={
      'xcom_pull_task': 'run_cellranger',
      'xcom_pull_files_key': 'cellranger_output',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'loom_file_key': 'loom_output',
      'loom_file_task': 'run_velocyto',
      'scanpy_h5ad_task': 'run_scanpy_for_sc_5p',
      'scanpy_h5ad_key': 'scanpy_h5ad',
      'timeout': 2400,
      'allow_errors': False,
      'cpu_threads': 14,
      'output_notebook_key': 'scvelo_notebook',
    })
  # Registers the loom file produced by run_velocyto (VELOCYTO_LOOM
  # collection); the irods upload below reads the loaded files from this
  # task's 'output_db_files' xcom.
  load_loom_file_to_rds = PythonOperator(
    task_id='load_loom_file_to_rds',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_velocyto',
      'file_name_key': 'loom_output',
      'analysis_name': 'velocyto_5p',
      'collection_type': 'VELOCYTO_LOOM',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_loom_file_to_irods = PythonOperator(
    task_id='upload_loom_file_to_irods',
    dag=dag,
    queue='hpc_4G',
    python_callable=irods_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_loom_file_to_rds',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_key': 'sample_igf_id',
      'collection_name_task': 'load_cellranger_result_to_db',
      'analysis_name': 'velocyto_loom',
    })
  # Registers the scvelo notebook (SCVELO_HTML collection); both uploads
  # below read the loaded files from this task's 'output_db_files' xcom.
  load_scvelo_report_to_rds = PythonOperator(
    task_id='load_scvelo_report_to_rds',
    dag=dag,
    queue='hpc_4G',
    python_callable=load_analysis_files_func,
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scvelo_for_sc_5p',
      'file_name_key': 'scvelo_notebook',
      'analysis_name': 'scvelo_5p',
      'collection_type': 'SCVELO_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files',
    })
  upload_scvelo_report_to_ftp = PythonOperator(
    task_id='upload_scvelo_report_to_ftp',
    dag=dag,
    queue='hpc_4G',
    python_callable=ftp_files_upload_for_analysis,
    params={
      'xcom_pull_task': 'load_scvelo_report_to_rds',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCVELO_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True,
    })
  upload_scvelo_report_to_box = PythonOperator(
    task_id='upload_scvelo_report_to_box',
    dag=dag,
    queue='hpc_4G',
    python_callable=upload_analysis_file_to_box,
    params={
      'xcom_pull_task': 'load_scvelo_report_to_rds',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scvelo_single_sample_report',
    })
  ## PIPELINE
  # cram conversion -> cell-sorted bam -> velocyto.
  convert_cellranger_bam_to_cram >> generate_cell_sorted_bam
  generate_cell_sorted_bam >> run_velocyto
  # scvelo waits for both the velocyto and the scanpy tasks.
  [run_velocyto, run_scanpy_for_sc_5p] >> run_scvelo_for_sc_5p
  # loom file: load, then upload to irods.
  run_velocyto >> load_loom_file_to_rds
  load_loom_file_to_rds >> upload_loom_file_to_irods
  # scvelo report: load, then upload to ftp and box.
  run_scvelo_for_sc_5p >> load_scvelo_report_to_rds
  load_scvelo_report_to_rds >> [
    upload_scvelo_report_to_ftp,
    upload_scvelo_report_to_box]
  ## TASK
  # Branch operator running index_and_copy_bam_for_parallel_analysis on the
  # cellranger bam; 'list_of_tasks' names the picard / samtools task ids it
  # is given to work with.
  copy_bam_for_parallel_runs = BranchPythonOperator(
    task_id='copy_bam_for_parallel_runs',
    dag=dag,
    queue='hpc_4G',
    python_callable=index_and_copy_bam_for_parallel_analysis,
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'cellranger_bam_path': 'count/sample_alignments.bam',
      'list_of_tasks': [
        'run_picard_alignment_summary',
        'run_picard_qual_summary',
        'run_picard_rna_summary',
        'run_picard_gc_summary',
        'run_picard_base_dist_summary',
        'run_samtools_stats',
      ],
    })
  665. ## TASK
  666. upload_cram_to_irods = \
  667. PythonOperator(
  668. task_id='upload_cram_to_irods',
  669. dag=dag,
  670. queue='hpc_4G',
  671. python_callable=irods_files_upload_for_analysis,
  672. params={
  673. 'xcom_pull_task': 'convert_cellranger_bam_to_cram',
  674. 'xcom_pull_files_key': 'cram_files',
  675. 'collection_name_key': 'sample_igf_id',
  676. 'collection_name_task': 'load_cellranger_result_to_db',
  677. 'analysis_name': 'cellranger_multi'})
  678. ## PIPELINE
  679. #convert_cellranger_bam_to_cram >> copy_bam_for_parallel_runs # we need to load metrics to cram
  680. generate_cell_sorted_bam >> copy_bam_for_parallel_runs
  681. convert_cellranger_bam_to_cram >> upload_cram_to_irods
  682. ## TASK
  683. run_picard_alignment_summary = \
  684. PythonOperator(
  685. task_id='run_picard_alignment_summary',
  686. dag=dag,
  687. queue='hpc_4G',
  688. python_callable=run_picard_for_cellranger,
  689. params={'xcom_pull_files_key':'run_picard_alignment_summary',
  690. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  691. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  692. 'analysis_description_xcom_key':'analysis_description',
  693. 'use_ephemeral_space':True,
  694. 'load_metrics_to_cram':True,
  695. 'java_param':'-Xmx4g',
  696. 'picard_command':'CollectAlignmentSummaryMetrics',
  697. 'picard_option':{},
  698. 'analysis_files_xcom_key':'picard_alignment_summary',
  699. 'bam_files_xcom_key':None})
  700. ## PIPELINE
  701. copy_bam_for_parallel_runs >> run_picard_alignment_summary
  702. ## TASK
  703. cleanup_picard_alignment_summary_input = \
  704. PythonOperator(
  705. task_id='cleanup_picard_alignment_summary_input',
  706. dag=dag,
  707. queue='hpc_4G',
  708. python_callable=clean_up_files,
  709. params={'xcom_pull_files_key':'run_picard_alignment_summary',
  710. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  711. ## PIPELINE
  712. run_picard_alignment_summary >> cleanup_picard_alignment_summary_input
  713. ## TASK
  714. upload_picard_alignment_summary_to_box = \
  715. PythonOperator(
  716. task_id='upload_picard_alignment_summary_to_box',
  717. dag=dag,
  718. queue='hpc_4G',
  719. python_callable=upload_analysis_file_to_box,
  720. params={'xcom_pull_task':'run_picard_alignment_summary',
  721. 'xcom_pull_files_key':'picard_alignment_summary',
  722. 'analysis_tag':'Picard-CollectAlignmentSummaryMetrics'})
  723. ## PIPELINE
  724. run_picard_alignment_summary >> upload_picard_alignment_summary_to_box
  725. ## TASK
  726. run_picard_qual_summary = \
  727. PythonOperator(
  728. task_id='run_picard_qual_summary',
  729. dag=dag,
  730. queue='hpc_4G',
  731. python_callable=run_picard_for_cellranger,
  732. params={'xcom_pull_files_key':'run_picard_qual_summary',
  733. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  734. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  735. 'analysis_description_xcom_key':'analysis_description',
  736. 'use_ephemeral_space':True,
  737. 'load_metrics_to_cram':True,
  738. 'java_param':'-Xmx4g',
  739. 'picard_command':'QualityScoreDistribution',
  740. 'picard_option':{},
  741. 'analysis_files_xcom_key':'picard_qual_summary',
  742. 'bam_files_xcom_key':None})
  743. ## PIPELINE
  744. copy_bam_for_parallel_runs >> run_picard_qual_summary
  745. ## TASK
  746. cleanup_picard_qual_summary_input = \
  747. PythonOperator(
  748. task_id='cleanup_picard_qual_summary_input',
  749. dag=dag,
  750. queue='hpc_4G',
  751. python_callable=clean_up_files,
  752. params={'xcom_pull_files_key':'run_picard_qual_summary',
  753. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  754. ## PIPELINE
  755. run_picard_qual_summary >> cleanup_picard_qual_summary_input
  756. ## TASK
  757. upload_picard_qual_summary_to_box = \
  758. PythonOperator(
  759. task_id='upload_picard_qual_summary_to_box',
  760. dag=dag,
  761. queue='hpc_4G',
  762. python_callable=upload_analysis_file_to_box,
  763. params={'xcom_pull_task':'run_picard_qual_summary',
  764. 'xcom_pull_files_key':'picard_qual_summary',
  765. 'analysis_tag':'Picard-QualityScoreDistribution'})
  766. ## PIPELINE
  767. run_picard_qual_summary >> upload_picard_qual_summary_to_box
  768. ## TASK
  769. run_picard_rna_summary = \
  770. PythonOperator(
  771. task_id='run_picard_rna_summary',
  772. dag=dag,
  773. queue='hpc_8G',
  774. python_callable=run_picard_for_cellranger,
  775. params={'xcom_pull_files_key':'run_picard_rna_summary',
  776. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  777. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  778. 'analysis_description_xcom_key':'analysis_description',
  779. 'use_ephemeral_space':True,
  780. 'load_metrics_to_cram':True,
  781. 'java_param':'-Xmx7g',
  782. 'picard_command':'CollectRnaSeqMetrics',
  783. 'picard_option':{},
  784. 'analysis_files_xcom_key':'picard_rna_summary',
  785. 'bam_files_xcom_key':None})
  786. ## PIPELINE
  787. copy_bam_for_parallel_runs >> run_picard_rna_summary
  788. ## TASK
  789. cleanup_picard_rna_summary_input = \
  790. PythonOperator(
  791. task_id='cleanup_picard_rna_summary_input',
  792. dag=dag,
  793. queue='hpc_4G',
  794. python_callable=clean_up_files,
  795. params={'xcom_pull_files_key':'run_picard_rna_summary',
  796. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  797. ## PIPELINE
  798. run_picard_rna_summary >> cleanup_picard_rna_summary_input
  799. ## TASK
  800. upload_picard_rna_summary_to_box = \
  801. PythonOperator(
  802. task_id='upload_picard_rna_summary_to_box',
  803. dag=dag,
  804. queue='hpc_4G',
  805. python_callable=upload_analysis_file_to_box,
  806. params={'xcom_pull_task':'run_picard_rna_summary',
  807. 'xcom_pull_files_key':'picard_rna_summary',
  808. 'analysis_tag':'Picard-CollectRnaSeqMetrics'})
  809. ## PIPELINE
  810. run_picard_rna_summary >> upload_picard_rna_summary_to_box
  811. ## TASK
  812. run_picard_gc_summary = \
  813. PythonOperator(
  814. task_id='run_picard_gc_summary',
  815. dag=dag,
  816. queue='hpc_4G',
  817. python_callable=run_picard_for_cellranger,
  818. params={'xcom_pull_files_key':'run_picard_gc_summary',
  819. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  820. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  821. 'analysis_description_xcom_key':'analysis_description',
  822. 'use_ephemeral_space':True,
  823. 'load_metrics_to_cram':True,
  824. 'java_param':'-Xmx4g',
  825. 'picard_command':'CollectGcBiasMetrics',
  826. 'picard_option':{},
  827. 'analysis_files_xcom_key':'picard_gc_summary',
  828. 'bam_files_xcom_key':None})
  829. ## PIPELINE
  830. copy_bam_for_parallel_runs >> run_picard_gc_summary
  831. ## TASK
  832. cleanup_picard_gc_summary_input = \
  833. PythonOperator(
  834. task_id='cleanup_picard_gc_summary_input',
  835. dag=dag,
  836. queue='hpc_4G',
  837. python_callable=clean_up_files,
  838. params={'xcom_pull_files_key':'run_picard_gc_summary',
  839. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  840. ## PIPELINE
  841. run_picard_gc_summary >> cleanup_picard_gc_summary_input
  842. ## TASK
  843. upload_picard_gc_summary_to_box = \
  844. PythonOperator(
  845. task_id='upload_picard_gc_summary_to_box',
  846. dag=dag,
  847. queue='hpc_4G',
  848. python_callable=upload_analysis_file_to_box,
  849. params={'xcom_pull_task':'run_picard_gc_summary',
  850. 'xcom_pull_files_key':'picard_gc_summary',
  851. 'analysis_tag':'Picard-CollectGcBiasMetrics'})
  852. ## PIPELINE
  853. run_picard_gc_summary >> upload_picard_gc_summary_to_box
  854. ## TASK
  855. run_picard_base_dist_summary = \
  856. PythonOperator(
  857. task_id='run_picard_base_dist_summary',
  858. dag=dag,
  859. queue='hpc_4G',
  860. python_callable=run_picard_for_cellranger,
  861. params={'xcom_pull_files_key':'run_picard_base_dist_summary',
  862. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  863. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  864. 'analysis_description_xcom_key':'analysis_description',
  865. 'use_ephemeral_space':True,
  866. 'load_metrics_to_cram':True,
  867. 'java_param':'-Xmx4g',
  868. 'picard_command':'CollectBaseDistributionByCycle',
  869. 'picard_option':{},
  870. 'analysis_files_xcom_key':'picard_base_summary',
  871. 'bam_files_xcom_key':None})
  872. ## PIPELINE
  873. copy_bam_for_parallel_runs >> run_picard_base_dist_summary
  874. ## TASK
  875. cleanup_picard_base_dist_summary_input = \
  876. PythonOperator(
  877. task_id='cleanup_picard_base_dist_summary_input',
  878. dag=dag,
  879. queue='hpc_4G',
  880. python_callable=clean_up_files,
  881. params={'xcom_pull_files_key':'run_picard_base_dist_summary',
  882. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  883. ## PIPELINE
  884. run_picard_base_dist_summary >> cleanup_picard_base_dist_summary_input
  885. ## TASK
  886. upload_picard_base_dist_summary_to_box = \
  887. PythonOperator(
  888. task_id='upload_picard_base_dist_summary_to_box',
  889. dag=dag,
  890. queue='hpc_4G',
  891. python_callable=upload_analysis_file_to_box,
  892. params={'xcom_pull_task':'run_picard_base_dist_summary',
  893. 'xcom_pull_files_key':'picard_base_summary',
  894. 'analysis_tag':'Picard-CollectBaseDistributionByCycle'})
  895. ## PIPELINE
  896. run_picard_base_dist_summary >> upload_picard_base_dist_summary_to_box
  897. ## TASK
  898. run_samtools_stats = \
  899. PythonOperator(
  900. task_id='run_samtools_stats',
  901. dag=dag,
  902. queue='hpc_4G4t',
  903. python_callable=run_samtools_for_cellranger,
  904. params={'xcom_pull_files_key':'run_samtools_stats',
  905. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  906. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  907. 'analysis_description_xcom_key':'analysis_description',
  908. 'use_ephemeral_space':True,
  909. 'load_metrics_to_cram':True,
  910. 'samtools_command':'stats',
  911. 'threads':4,
  912. 'analysis_files_xcom_key':'samtools_stats'})
  913. ## PIPELINE
  914. copy_bam_for_parallel_runs >> run_samtools_stats
  915. ## TASK
  916. upload_samtools_stats_to_box = \
  917. PythonOperator(
  918. task_id='upload_samtools_stats_to_box',
  919. dag=dag,
  920. queue='hpc_4G',
  921. python_callable=upload_analysis_file_to_box,
  922. params={'xcom_pull_task':'run_samtools_stats',
  923. 'xcom_pull_files_key':'samtools_stats',
  924. 'analysis_tag':'Samtools-stats'})
  925. ## PIPELINE
  926. run_samtools_stats >> upload_samtools_stats_to_box
  927. ## TASK
  928. run_samtools_idxstats = \
  929. PythonOperator(
  930. task_id='run_samtools_idxstats',
  931. dag=dag,
  932. queue='hpc_4G4t',
  933. python_callable=run_samtools_for_cellranger,
  934. params={'xcom_pull_files_key':'run_samtools_stats',
  935. 'xcom_pull_task':'copy_bam_for_parallel_runs',
  936. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  937. 'analysis_description_xcom_key':'analysis_description',
  938. 'use_ephemeral_space':True,
  939. 'load_metrics_to_cram':True,
  940. 'samtools_command':'idxstats',
  941. 'threads':4,
  942. 'analysis_files_xcom_key':'samtools_idxstats'})
  943. ## PIPELINE
  944. run_samtools_stats >> run_samtools_idxstats
  945. ## TASK
  946. cleanup_samtools_stats_input = \
  947. PythonOperator(
  948. task_id='cleanup_samtools_stats_input',
  949. dag=dag,
  950. queue='hpc_4G',
  951. python_callable=clean_up_files,
  952. params={'xcom_pull_files_key':'run_samtools_stats',
  953. 'xcom_pull_task':'copy_bam_for_parallel_runs'})
  954. ## PIPELINE
  955. run_samtools_idxstats >> cleanup_samtools_stats_input
  956. ## TASK
  957. upload_samtools_idxstats_to_box = \
  958. PythonOperator(
  959. task_id='upload_samtools_idxstats_to_box',
  960. dag=dag,
  961. queue='hpc_4G',
  962. python_callable=upload_analysis_file_to_box,
  963. params={'xcom_pull_task':'run_samtools_idxstats',
  964. 'xcom_pull_files_key':'samtools_idxstats',
  965. 'analysis_tag':'Samtools-idxstats'})
  966. ## PIPELINE
  967. run_samtools_idxstats >> upload_samtools_idxstats_to_box
  968. ## TASK
  969. run_multiqc = \
  970. PythonOperator(
  971. task_id='run_multiqc',
  972. dag=dag,
  973. queue='hpc_4G',
  974. trigger_rule='none_failed_or_skipped',
  975. python_callable=run_multiqc_for_cellranger,
  976. params={
  977. 'list_of_analysis_xcoms_and_tasks':{
  978. 'run_cellranger':'cellranger_output',
  979. 'run_picard_alignment_summary':'picard_alignment_summary',
  980. 'run_picard_qual_summary':'picard_qual_summary',
  981. 'run_picard_rna_summary':'picard_rna_summary',
  982. 'run_picard_gc_summary':'picard_gc_summary',
  983. 'run_picard_base_dist_summary':'picard_base_summary',
  984. 'run_samtools_stats':'samtools_stats',
  985. 'run_samtools_idxstats':'samtools_idxstats'},
  986. 'analysis_description_xcom_pull_task':'fetch_analysis_info',
  987. 'analysis_description_xcom_key':'analysis_description',
  988. 'use_ephemeral_space':True,
  989. 'multiqc_html_file_xcom_key':'multiqc_html',
  990. 'multiqc_data_file_xcom_key':'multiqc_data',
  991. 'tool_order_list':['picad','samtools']})
  992. ## PIPELINE
  993. run_picard_alignment_summary >> run_multiqc
  994. run_picard_qual_summary >> run_multiqc
  995. run_picard_rna_summary >> run_multiqc
  996. run_picard_gc_summary >> run_multiqc
  997. run_picard_base_dist_summary >> run_multiqc
  998. run_samtools_idxstats >> run_multiqc
  999. ## TASK
  1000. load_multiqc_html = \
  1001. PythonOperator(
  1002. task_id='load_multiqc_html',
  1003. dag=dag,
  1004. queue='hpc_4G',
  1005. python_callable=load_analysis_files_func,
  1006. params={'collection_name_task':'load_cellranger_result_to_db',
  1007. 'collection_name_key':'sample_igf_id',
  1008. 'file_name_task':'run_multiqc',
  1009. 'file_name_key':'multiqc_html',
  1010. 'analysis_name':'multiqc',
  1011. 'collection_type':'MULTIQC_HTML',
  1012. 'collection_table':'sample',
  1013. 'output_files_key':'output_db_files'})
  1014. ## PIPELINE
  1015. run_multiqc >> load_multiqc_html
  1016. ## TASK
  1017. upload_multiqc_to_ftp = \
  1018. PythonOperator(
  1019. task_id='upload_multiqc_to_ftp',
  1020. dag=dag,
  1021. queue='hpc_4G',
  1022. python_callable=ftp_files_upload_for_analysis,
  1023. params={'xcom_pull_task':'load_multiqc_html',
  1024. 'xcom_pull_files_key':'output_db_files',
  1025. 'collection_name_task':'load_cellranger_result_to_db',
  1026. 'collection_name_key':'sample_igf_id',
  1027. 'collection_type':'FTP_MULTIQC_HTML',
  1028. 'collection_table':'sample',
  1029. 'collect_remote_file':True})
  1030. ## PIPELINE
  1031. load_multiqc_html >> upload_multiqc_to_ftp
  1032. ## TASK
  1033. upload_multiqc_to_box = \
  1034. PythonOperator(
  1035. task_id='upload_multiqc_to_box',
  1036. dag=dag,
  1037. queue='hpc_4G',
  1038. python_callable=upload_analysis_file_to_box,
  1039. params={'xcom_pull_task':'load_multiqc_html',
  1040. 'xcom_pull_files_key':'output_db_files',
  1041. 'analysis_tag':'multiqc_report'})
  1042. ## PIPELINE
  1043. load_multiqc_html >> upload_multiqc_to_box
  1044. ## TASK
  1045. update_analysis_and_status = \
  1046. PythonOperator(
  1047. task_id='update_analysis_and_status',
  1048. dag=dag,
  1049. queue='hpc_4G',
  1050. python_callable=change_pipeline_status,
  1051. trigger_rule='none_failed_or_skipped',
  1052. params={'new_status':'FINISHED',
  1053. 'no_change_status':'SEEDED'})
  1054. ## PIPELINE
  1055. upload_multiqc_to_ftp >> update_analysis_and_status
  1056. upload_scanpy_report_for_sc_5p_to_ftp >> update_analysis_and_status
  1057. upload_scanpy_report_for_sc_5p_to_box >> update_analysis_and_status
  1058. upload_cellbrowser_for_sc_5p_to_ftp >> update_analysis_and_status
  1059. #upload_scirpy_report_for_vdj_to_ftp >> update_analysis_and_status # no more ambiguous VDJ
  1060. #upload_scirpy_report_for_vdj_to_box >> update_analysis_and_status
  1061. upload_scirpy_report_for_vdj_b_to_ftp >> update_analysis_and_status
  1062. upload_scirpy_report_for_vdj_b_to_box >> update_analysis_and_status
  1063. upload_scirpy_report_for_vdj_t_to_ftp >> update_analysis_and_status
  1064. upload_scirpy_report_for_vdj_t_to_box >> update_analysis_and_status
  1065. upload_seurat_report_for_sc_5p_ftp >> update_analysis_and_status
  1066. upload_seurat_report_for_sc_5p_to_box >> update_analysis_and_status
  1067. upload_cellranger_results_to_irods >> update_analysis_and_status
  1068. upload_cellranger_report_to_ftp >> update_analysis_and_status
  1069. upload_cellranger_report_to_box >> update_analysis_and_status
  1070. upload_cram_to_irods >> update_analysis_and_status
  1071. upload_scvelo_report_to_box >> update_analysis_and_status
  1072. upload_scvelo_report_to_ftp >> update_analysis_and_status
  1073. ## TASK
  1074. update_qc_pages = \
  1075. PythonOperator(
  1076. task_id='update_qc_pages',
  1077. dag=dag,
  1078. queue='hpc_4G',
  1079. python_callable=create_and_update_qc_pages,
  1080. params={'use_ephemeral_space':True,
  1081. 'collection_type_list':[
  1082. 'FTP_MULTIQC_HTML',
  1083. 'FTP_SEURAT_HTML',
  1084. 'FTP_SCIRPY_VDJ_T_HTML',
  1085. 'FTP_SCIRPY_VDJ_B_HTML',
  1086. 'FTP_SCIRPY_VDJ_HTML',
  1087. 'FTP_CELLBROWSER',
  1088. 'FTP_SCANPY_HTML',
  1089. 'FTP_SCVELO_HTML',
  1090. 'FTP_CELLRANGER_HTML']})
  1091. ## PIPELINE
  1092. update_analysis_and_status >> update_qc_pages