dag9_tenx_single_cell_immune_profiling.py 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158
  1. from datetime import timedelta
  2. import os,json,logging,subprocess
  3. from airflow.models import DAG,Variable
  4. from airflow.utils.dates import days_ago
  5. from airflow.operators.python_operator import PythonOperator
  6. from airflow.operators.python_operator import BranchPythonOperator
  7. from airflow.operators.dummy_operator import DummyOperator
  8. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import fetch_analysis_info_and_branch_func
  9. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import configure_cellranger_run_func
  10. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_sc_read_trimmming_func
  11. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_cellranger_tool
  12. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import decide_analysis_branch_func
  13. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_result_to_db_func
  14. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import ftp_files_upload_for_analysis
  15. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import irods_files_upload_for_analysis
  16. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_singlecell_notebook_wrapper_func
  17. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_analysis_files_func
  18. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import task_branch_function
  19. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import upload_analysis_file_to_box
  20. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import convert_bam_to_cram_func
  21. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_picard_for_cellranger
  22. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_samtools_for_cellranger
  23. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_multiqc_for_cellranger
  24. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import index_and_copy_bam_for_parallel_analysis
  25. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import change_pipeline_status
  26. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import clean_up_files
  27. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import create_and_update_qc_pages
  28. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import load_cellranger_metrices_to_collection
  29. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import generate_cell_sorted_bam_func
  30. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_velocyto_func
  31. from igf_airflow.utils.dag9_tenx_single_cell_immune_profiling_utils import run_scvelo_for_sc_5p_func
  32. ## ARGS
  33. default_args = {
  34. 'owner': 'airflow',
  35. 'depends_on_past': False,
  36. 'start_date': days_ago(2),
  37. 'email_on_failure': False,
  38. 'email_on_retry': False,
  39. 'retries': 4,
  40. 'catchup': True,
  41. 'retry_delay': timedelta(minutes=5),
  42. 'provide_context': True,
  43. }
  44. FEATURE_TYPE_LIST = \
  45. Variable.get('tenx_single_cell_immune_profiling_feature_types', default_var={})
  46. ## DAG
  47. dag = \
  48. DAG(
  49. dag_id='dag9_tenx_single_cell_immune_profiling',
  50. schedule_interval=None,
  51. tags=['hpc', 'analysis', 'tenx', 'sc'],
  52. default_args=default_args,
  53. concurrency=100,
  54. max_active_runs=5,
  55. orientation='LR')
  56. with dag:
  ## TASK: first branching step, delegated to fetch_analysis_info_and_branch_func.
  ## The params carry the id of the 'no_analysis' task and the XCom key names
  ## that the later tasks in this file refer to.
  fetch_analysis_info_and_branch = BranchPythonOperator(
    task_id='fetch_analysis_info',
    dag=dag,
    queue='hpc_4G',
    python_callable=fetch_analysis_info_and_branch_func,
    params={
      'no_analysis_task': 'no_analysis',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info'})
  ## TASK: calls configure_cellranger_run_func; the per-feature
  ## collect_trimmed_files_* tasks are wired in as its upstream tasks, and the
  ## 'none_failed_or_skipped' trigger rule governs when it may start.
  configure_cellranger_run = PythonOperator(
    task_id='configure_cellranger_run',
    dag=dag,
    queue='hpc_4G',
    python_callable=configure_cellranger_run_func,
    trigger_rule='none_failed_or_skipped',
    params={
      'xcom_pull_task_id': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'analysis_info_xcom_key': 'analysis_info',
      'library_csv_xcom_key': 'cellranger_library_csv'})
  # For every configured feature type build: one branch task (named after the
  # feature type), ten numbered trimming tasks and one collector task that
  # feeds configure_cellranger_run.
  for analysis_name in FEATURE_TYPE_LIST.keys():
    ## TASK: branch task for this feature type, using task_branch_function
    task_branch = BranchPythonOperator(
      task_id=analysis_name,
      dag=dag,
      queue='hpc_4G',
      python_callable=task_branch_function,
      params={
        'xcom_pull_task_id': 'fetch_analysis_info',
        'analysis_info_xcom_key': 'analysis_info',
        'analysis_name': analysis_name,
        'task_prefix': 'run_trim'})
    run_trim_list = []
    for run_id in range(10):
      ## TASK: trimming slot number run_id for this feature type
      t = PythonOperator(
        task_id='run_trim_{0}_{1}'.format(analysis_name, run_id),
        dag=dag,
        queue='hpc_4G',
        python_callable=run_sc_read_trimmming_func,
        params={
          'xcom_pull_task_id': 'fetch_analysis_info',
          'analysis_info_xcom_key': 'analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'analysis_name': analysis_name,
          'run_id': run_id,
          'r1-length': 0,
          'r2-length': 0,
          'fastq_input_dir_tag': 'fastq_dir',
          'use_ephemeral_space': True,
          'fastq_output_dir_tag': 'output_path'})
      run_trim_list.append(t)
    ## TASK: join point after the trimming slots of this feature type
    collect_trimmed_files = DummyOperator(
      task_id='collect_trimmed_files_{0}'.format(analysis_name),
      dag=dag,
      trigger_rule='none_failed_or_skipped')
    ## PIPELINE
    fetch_analysis_info_and_branch >> task_branch >> run_trim_list
    run_trim_list >> collect_trimmed_files >> configure_cellranger_run
  ## TASK: placeholder task; its id matches the 'no_analysis_task' param of
  ## the fetch_analysis_info branch task.
  no_analysis = DummyOperator(dag=dag, task_id='no_analysis')
  ## PIPELINE
  fetch_analysis_info_and_branch >> no_analysis
  ## TASK: calls run_cellranger_tool on the large 'hpc_64G16t24hr' queue, with
  ## the command line options listed under 'cellranger_options'.
  run_cellranger = PythonOperator(
    task_id='run_cellranger',
    dag=dag,
    queue='hpc_64G16t24hr',
    python_callable=run_cellranger_tool,
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_options': ['--localcores 14', '--localmem 60']})
  ## PIPELINE
  configure_cellranger_run >> run_cellranger
  ## TASK: branch task after cellranger; decide_analysis_branch_func receives
  ## the ids of the candidate downstream tasks through params.
  ## NOTE(review): 'run_scirpy_for_vdj' has no matching task in the visible
  ## part of this file - confirm it is defined further down.
  decide_analysis_branch = BranchPythonOperator(
    task_id='decide_analysis_branch',
    dag=dag,
    queue='hpc_4G',
    params={
      'load_cellranger_result_to_db_task': 'load_cellranger_result_to_db',
      'run_scanpy_for_sc_5p_task': 'run_scanpy_for_sc_5p',
      'run_scirpy_for_vdj_task': 'run_scirpy_for_vdj',
      'run_scirpy_for_vdj_b_task': 'run_scirpy_for_vdj_b',
      'run_scirpy_vdj_t_task': 'run_scirpy_for_vdj_t',
      'run_seurat_for_sc_5p_task': 'run_seurat_for_sc_5p',
      'convert_cellranger_bam_to_cram_task': 'convert_cellranger_bam_to_cram',
      'library_csv_xcom_key': 'cellranger_library_csv',
      'library_csv_xcom_pull_task': 'configure_cellranger_run'},
    python_callable=decide_analysis_branch_func)
  ## PIPELINE
  run_cellranger >> decide_analysis_branch
  ## TASK: calls load_cellranger_result_to_db_func; the three upload tasks
  ## below name this task as the source of their XCom values.
  load_cellranger_result_to_db = PythonOperator(
    task_id='load_cellranger_result_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'html_collection_type': 'CELLRANGER_HTML',
      'collection_table': 'sample',
      'xcom_collection_name_key': 'sample_igf_id',
      'genome_column': 'genome_build',
      'analysis_name': 'cellranger_multi',
      'output_xcom_key': 'loaded_output_files',
      'html_xcom_key': 'html_report_file',
      'html_report_file_name': 'web_summary.html'},
    python_callable=load_cellranger_result_to_db_func)
  ## TASK: FTP upload of the value stored under 'html_report_file'
  upload_cellranger_report_to_ftp = PythonOperator(
    task_id='upload_cellranger_report_to_ftp',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLRANGER_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True},
    python_callable=ftp_files_upload_for_analysis)
  ## TASK: Box upload of the value stored under 'html_report_file'
  upload_cellranger_report_to_box = PythonOperator(
    task_id='upload_cellranger_report_to_box',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'html_report_file',
      'analysis_tag': 'cellranger_multi'},
    python_callable=upload_analysis_file_to_box)
  ## TASK: iRODS upload of the value stored under 'loaded_output_files'
  upload_cellranger_results_to_irods = PythonOperator(
    task_id='upload_cellranger_results_to_irods',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_cellranger_result_to_db',
      'xcom_pull_files_key': 'loaded_output_files',
      'collection_name_key': 'sample_igf_id',
      'collection_name_task': 'load_cellranger_result_to_db',
      'analysis_name': 'cellranger_multi'},
    python_callable=irods_files_upload_for_analysis)
  ## PIPELINE: the DB load fans out to the three uploads
  decide_analysis_branch >> load_cellranger_result_to_db
  load_cellranger_result_to_db >> [
    upload_cellranger_report_to_ftp,
    upload_cellranger_report_to_box,
    upload_cellranger_results_to_irods]
  ## TASK: notebook wrapper run with analysis_name 'scanpy'; the XCom keys it
  ## is given for its outputs are the ones the tasks below pull from.
  run_scanpy_for_sc_5p = PythonOperator(
    task_id='run_scanpy_for_sc_5p',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'count_dir': 'count',
      'analysis_name': 'scanpy',
      'output_notebook_key': 'scanpy_notebook',
      'output_cellbrowser_key': 'cellbrowser_dirs',
      'output_scanpy_h5ad_key': 'scanpy_h5ad',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description'},
    python_callable=run_singlecell_notebook_wrapper_func)
  ## TASK: metrics loader configured with the 'CELLRANGER_COUNT' prefix
  load_cellranger_gex_matrics_to_db = PythonOperator(
    task_id='load_cellranger_gex_matrics_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_COUNT'},
    python_callable=load_cellranger_metrices_to_collection)
  ## TASK: loads the 'scanpy_notebook' output of the scanpy run
  load_scanpy_report_for_sc_5p_to_db = PythonOperator(
    task_id='load_scanpy_report_for_sc_5p_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scanpy_for_sc_5p',
      'file_name_key': 'scanpy_notebook',
      'analysis_name': 'scanpy_5p',
      'collection_type': 'SCANPY_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files'},
    python_callable=load_analysis_files_func)
  ## TASK: FTP upload of the loaded scanpy report
  upload_scanpy_report_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_ftp',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCANPY_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True},
    python_callable=ftp_files_upload_for_analysis)
  ## TASK: Box upload of the loaded scanpy report
  upload_scanpy_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_sc_5p_to_box',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_scanpy_report_for_sc_5p_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scanpy_single_sample_report'},
    python_callable=upload_analysis_file_to_box)
  ## TASK: FTP upload of the 'cellbrowser_dirs' output of the scanpy run
  upload_cellbrowser_for_sc_5p_to_ftp = PythonOperator(
    task_id='upload_cellbrowser_for_sc_5p_to_ftp',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'run_scanpy_for_sc_5p',
      'xcom_pull_files_key': 'cellbrowser_dirs',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_CELLBROWSER',
      'collection_table': 'sample',
      'collect_remote_file': True},
    python_callable=ftp_files_upload_for_analysis)
  ## PIPELINE
  decide_analysis_branch >> run_scanpy_for_sc_5p
  run_scanpy_for_sc_5p >> [
    load_scanpy_report_for_sc_5p_to_db,
    load_cellranger_gex_matrics_to_db,
    upload_cellbrowser_for_sc_5p_to_ftp]
  load_scanpy_report_for_sc_5p_to_db >> [
    upload_scanpy_report_for_sc_5p_to_ftp,
    upload_scanpy_report_for_sc_5p_to_box]
  ## TASK: notebook wrapper run with analysis_name 'scirpy' and vdj_dir 'vdj_b'
  run_scirpy_for_vdj_b = PythonOperator(
    task_id='run_scirpy_for_vdj_b',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_b',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description'},
    python_callable=run_singlecell_notebook_wrapper_func)
  ## TASK: metrics loader configured with the 'CELLRANGER_VDJB' prefix
  load_cellranger_vdjB_matrics_to_db = PythonOperator(
    task_id='load_cellranger_vdjB_matrics_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_VDJB'},
    python_callable=load_cellranger_metrices_to_collection)
  ## TASK: loads the 'scirpy_notebook' output of the vdj_b scirpy run
  load_scirpy_report_for_vdj_b_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_b_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_b',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_b',
      'collection_type': 'SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files'},
    python_callable=load_analysis_files_func)
  ## TASK: FTP upload of the loaded vdj_b scirpy report
  upload_scirpy_report_for_vdj_b_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_b_to_ftp',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_B_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True},
    python_callable=ftp_files_upload_for_analysis)
  ## TASK: Box upload of the loaded vdj_b scirpy report
  ## NOTE(review): the task_id below says 'scanpy' while the variable name and
  ## every sibling task say 'scirpy'; it looks like a copy/paste slip. It is
  ## left untouched here because renaming a task id is externally visible.
  upload_scirpy_report_for_vdj_b_to_box = PythonOperator(
    task_id='upload_scanpy_report_for_vdj_b_to_box',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_b_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_b_single_sample_report'},
    python_callable=upload_analysis_file_to_box)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_b
  run_scirpy_for_vdj_b >> [
    load_scirpy_report_for_vdj_b_to_db,
    load_cellranger_vdjB_matrics_to_db]
  load_scirpy_report_for_vdj_b_to_db >> [
    upload_scirpy_report_for_vdj_b_to_ftp,
    upload_scirpy_report_for_vdj_b_to_box]
  ## TASK: notebook wrapper run with analysis_name 'scirpy' and vdj_dir 'vdj_t'
  run_scirpy_for_vdj_t = PythonOperator(
    task_id='run_scirpy_for_vdj_t',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'python3',
      'analysis_name': 'scirpy',
      'vdj_dir': 'vdj_t',
      'count_dir': 'count',
      'output_notebook_key': 'scirpy_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description'},
    python_callable=run_singlecell_notebook_wrapper_func)
  ## TASK: metrics loader configured with the 'CELLRANGER_VDJT' prefix
  load_cellranger_vdjT_matrics_to_db = PythonOperator(
    task_id='load_cellranger_vdjT_matrics_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'collection_type': 'CELLRANGER_MULTI',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'metrics_summary_file': 'metrics_summary.csv',
      'attribute_prefix': 'CELLRANGER_VDJT'},
    python_callable=load_cellranger_metrices_to_collection)
  ## TASK: loads the 'scirpy_notebook' output of the vdj_t scirpy run
  load_scirpy_report_for_vdj_t_to_db = PythonOperator(
    task_id='load_scirpy_report_for_vdj_t_to_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_scirpy_for_vdj_t',
      'file_name_key': 'scirpy_notebook',
      'analysis_name': 'scirpy_vdj_t',
      'collection_type': 'SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files'},
    python_callable=load_analysis_files_func)
  ## TASK: FTP upload of the loaded vdj_t scirpy report
  upload_scirpy_report_for_vdj_t_to_ftp = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_ftp',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SCIRPY_VDJ_T_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True},
    python_callable=ftp_files_upload_for_analysis)
  ## TASK: Box upload of the loaded vdj_t scirpy report
  upload_scirpy_report_for_vdj_t_to_box = PythonOperator(
    task_id='upload_scirpy_report_for_vdj_t_to_box',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_scirpy_report_for_vdj_t_to_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'scirpy_vdj_t_single_sample_report'},
    python_callable=upload_analysis_file_to_box)
  ## PIPELINE
  decide_analysis_branch >> run_scirpy_for_vdj_t
  run_scirpy_for_vdj_t >> [
    load_scirpy_report_for_vdj_t_to_db,
    load_cellranger_vdjT_matrics_to_db]
  load_scirpy_report_for_vdj_t_to_db >> [
    upload_scirpy_report_for_vdj_t_to_ftp,
    upload_scirpy_report_for_vdj_t_to_box]
  ## TASK: notebook wrapper run with analysis_name 'seurat'; unlike the other
  ## notebook tasks it is given the 'ir' kernel name.
  run_seurat_for_sc_5p = PythonOperator(
    task_id='run_seurat_for_sc_5p',
    dag=dag,
    queue='hpc_4G',
    params={
      'cellranger_xcom_key': 'cellranger_output',
      'cellranger_xcom_pull_task': 'run_cellranger',
      'scanpy_timeout': 1200,
      'allow_errors': False,
      'kernel_name': 'ir',
      'analysis_name': 'seurat',
      'vdj_dir': 'vdj',
      'count_dir': 'count',
      'output_notebook_key': 'seurat_notebook',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description'},
    python_callable=run_singlecell_notebook_wrapper_func)
  ## TASK: loads the 'seurat_notebook' output of the seurat run
  load_seurat_report_for_sc_5p_db = PythonOperator(
    task_id='load_seurat_report_for_sc_5p_db',
    dag=dag,
    queue='hpc_4G',
    params={
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'file_name_task': 'run_seurat_for_sc_5p',
      'file_name_key': 'seurat_notebook',
      'analysis_name': 'seurat_5p',
      'collection_type': 'SEURAT_HTML',
      'collection_table': 'sample',
      'output_files_key': 'output_db_files'},
    python_callable=load_analysis_files_func)
  ## TASK: FTP upload of the loaded seurat report
  upload_seurat_report_for_sc_5p_ftp = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_ftp',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'collection_name_task': 'load_cellranger_result_to_db',
      'collection_name_key': 'sample_igf_id',
      'collection_type': 'FTP_SEURAT_HTML',
      'collection_table': 'sample',
      'collect_remote_file': True},
    python_callable=ftp_files_upload_for_analysis)
  ## TASK: Box upload of the loaded seurat report
  upload_seurat_report_for_sc_5p_to_box = PythonOperator(
    task_id='upload_seurat_report_for_sc_5p_to_box',
    dag=dag,
    queue='hpc_4G',
    params={
      'xcom_pull_task': 'load_seurat_report_for_sc_5p_db',
      'xcom_pull_files_key': 'output_db_files',
      'analysis_tag': 'seurat_single_sample_report'},
    python_callable=upload_analysis_file_to_box)
  ## PIPELINE
  decide_analysis_branch >> run_seurat_for_sc_5p >> load_seurat_report_for_sc_5p_db
  load_seurat_report_for_sc_5p_db >> [
    upload_seurat_report_for_sc_5p_ftp,
    upload_seurat_report_for_sc_5p_to_box]
  ## TASK: calls convert_bam_to_cram_func on the four-thread 'hpc_4G4t' queue;
  ## the result is published under the 'cram_files' XCom key named in params.
  convert_cellranger_bam_to_cram = PythonOperator(
    task_id='convert_cellranger_bam_to_cram',
    dag=dag,
    queue='hpc_4G4t',
    params={
      'xcom_pull_files_key': 'cellranger_output',
      'xcom_pull_task': 'run_cellranger',
      'analysis_description_xcom_pull_task': 'fetch_analysis_info',
      'analysis_description_xcom_key': 'analysis_description',
      'use_ephemeral_space': True,
      'threads': 4,
      'cellranger_bam_path': 'count/sample_alignments.bam',
      'analysis_name': 'cellranger',
      'collection_type': 'ANALYSIS_CRAM',
      'collection_table': 'sample',
      'cram_files_xcom_key': 'cram_files'},
    python_callable=convert_bam_to_cram_func)
  ## PIPELINE
  decide_analysis_branch >> convert_cellranger_bam_to_cram
  564. ## TASK
  565. generate_cell_sorted_bam = \
  566. PythonOperator(
  567. task_id='generate_cell_sorted_bam',
  568. dag=dag,
  569. queue='hpc_16G8t',
  570. python_callable=generate_cell_sorted_bam_func,
  571. params={
  572. 'xcom_pull_task': 'run_cellranger',
  573. 'xcom_pull_files_key': 'cellranger_output',
  574. 'cellranger_bam_path': 'count/sample_alignments.bam',
  575. 'cellsorted_bam_path': 'count/cellsorted_sample_alignments.bam',
  576. 'samtools_mem': '2G',
  577. 'threads': 7})
  578. run_velocyto = \
  579. PythonOperator(
  580. task_id='run_velocyto',
  581. queue='hpc_16G_long',
  582. python_callable=run_velocyto_func,
  583. params={
  584. 'xcom_pull_task': 'run_cellranger',
  585. 'xcom_pull_files_key': 'cellranger_output',
  586. 'cell_sorted_bam_name': 'count/cellsorted_sample_alignments.bam',
  587. 'analysis_description_xcom_pull_task': 'fetch_analysis_info',
  588. 'analysis_description_xcom_key': 'analysis_description' })
  589. run_scvelo_for_sc_5p = \
  590. PythonOperator(
  591. task_id='run_scvelo_for_sc_5p',
  592. dag=dag,
  593. queue='hpc_32G16t',
  594. python_callable=run_scvelo_for_sc_5p_func,
  595. params={
  596. 'xcom_pull_task': 'run_cellranger',
  597. 'xcom_pull_files_key': 'cellranger_output',
  598. 'analysis_description_xcom_pull_task': 'fetch_analysis_info',
  599. 'analysis_description_xcom_key': 'analysis_description',
  600. 'loom_file_key': 'loom_output',
  601. 'loom_file_task': 'run_velocyto',
  602. 'scanpy_h5ad_task':'run_scanpy_for_sc_5p',
  603. 'scanpy_h5ad_key': 'scanpy_h5ad',
  604. 'timeout': 2400,
  605. 'allow_errors': False,
  606. 'cpu_threads': 14,
  607. 'output_notebook_key': 'scvelo_notebook'})
  608. load_loom_file_to_rds = \
  609. PythonOperator(
  610. task_id='load_loom_file_to_rds',
  611. dag=dag,
  612. queue='hpc_4G',
  613. python_callable=load_analysis_files_func,
  614. params={
  615. 'collection_name_task': 'load_cellranger_result_to_db',
  616. 'collection_name_key': 'sample_igf_id',
  617. 'file_name_task': 'run_velocyto',
  618. 'file_name_key': 'loom_output',
  619. 'analysis_name': 'velocyto_5p',
  620. 'collection_type': 'VELOCYTO_LOOM',
  621. 'collection_table': 'sample',
  622. 'output_files_key': 'output_db_files'})
  623. upload_loom_file_to_irods = \
  624. PythonOperator(
  625. task_id='upload_loom_file_to_irods',
  626. dag=dag,
  627. queue='hpc_4G',
  628. python_callable=irods_files_upload_for_analysis,
  629. params={
  630. 'xcom_pull_task': 'load_loom_file_to_rds',
  631. 'xcom_pull_files_key': 'output_db_files',
  632. 'collection_name_key': 'sample_igf_id',
  633. 'collection_name_task': 'load_cellranger_result_to_db',
  634. 'analysis_name': 'velocyto_loom'})
  635. load_scvelo_report_to_rds = \
  636. PythonOperator(
  637. task_id='load_scvelo_report_to_rds',
  638. dag=dag,
  639. queue='hpc_4G',
  640. python_callable=load_analysis_files_func,
  641. params={
  642. 'collection_name_task': 'load_cellranger_result_to_db',
  643. 'collection_name_key': 'sample_igf_id',
  644. 'file_name_task': 'run_scvelo_for_sc_5p',
  645. 'file_name_key': 'scvelo_notebook',
  646. 'analysis_name': 'scvelo_5p',
  647. 'collection_type': 'SCVELO_HTML',
  648. 'collection_table': 'sample',
  649. 'output_files_key': 'output_db_files'})
  650. upload_scvelo_report_to_ftp = \
  651. PythonOperator(
  652. task_id='upload_scvelo_report_to_ftp',
  653. dag=dag,
  654. queue='hpc_4G',
  655. python_callable=ftp_files_upload_for_analysis,
  656. params={
  657. 'xcom_pull_task': 'load_scvelo_report_to_rds',
  658. 'xcom_pull_files_key': 'output_db_files',
  659. 'collection_name_task': 'load_cellranger_result_to_db',
  660. 'collection_name_key': 'sample_igf_id',
  661. 'collection_type': 'FTP_SCVELO_HTML',
  662. 'collection_table': 'sample',
  663. 'collect_remote_file': True})
  664. upload_scvelo_report_to_box = \
  665. PythonOperator(
  666. task_id='upload_scvelo_report_to_box',
  667. dag=dag,
  668. queue='hpc_4G',
  669. python_callable=upload_analysis_file_to_box,
  670. params={
  671. 'xcom_pull_task': 'load_scvelo_report_to_rds',
  672. 'xcom_pull_files_key': 'output_db_files',
  673. 'analysis_tag': 'scvelo_single_sample_report'})
  ## PIPELINE
  # CRAM conversion -> cell-sorted BAM -> velocyto.
  convert_cellranger_bam_to_cram >> generate_cell_sorted_bam >> run_velocyto
  # scvelo waits for both velocyto and scanpy.
  run_velocyto >> run_scvelo_for_sc_5p
  run_scanpy_for_sc_5p >> run_scvelo_for_sc_5p
  # Loom branch: load, then upload to iRODS.
  run_velocyto >> load_loom_file_to_rds >> upload_loom_file_to_irods
  # scvelo report branch: load, then fan out to FTP and Box.
  run_scvelo_for_sc_5p >> load_scvelo_report_to_rds
  load_scvelo_report_to_rds >> upload_scvelo_report_to_ftp
  load_scvelo_report_to_rds >> upload_scvelo_report_to_box
  ## TASK
  # Branching task: runs index_and_copy_bam_for_parallel_analysis with the
  # BAM path 'count/sample_alignments.bam' relative to the 'cellranger_output'
  # value of task 'run_cellranger'.
  # 'list_of_tasks' names the six downstream QC tasks; each of them reads
  # from this task using its own task id as the key.
  copy_bam_for_parallel_runs = BranchPythonOperator(
      task_id='copy_bam_for_parallel_runs',
      python_callable=index_and_copy_bam_for_parallel_analysis,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'cellranger_output',
          'xcom_pull_task': 'run_cellranger',
          'cellranger_bam_path': 'count/sample_alignments.bam',
          'list_of_tasks': [
              'run_picard_alignment_summary',
              'run_picard_qual_summary',
              'run_picard_rna_summary',
              'run_picard_gc_summary',
              'run_picard_base_dist_summary',
              'run_samtools_stats',
          ],
      },
  )
  ## TASK
  # Run irods_files_upload_for_analysis on key 'cram_files' of task
  # 'convert_cellranger_bam_to_cram', labelled 'cellranger_multi'.
  upload_cram_to_irods = PythonOperator(
      task_id='upload_cram_to_irods',
      python_callable=irods_files_upload_for_analysis,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'convert_cellranger_bam_to_cram',
          'xcom_pull_files_key': 'cram_files',
          'collection_name_key': 'sample_igf_id',
          'collection_name_task': 'load_cellranger_result_to_db',
          'analysis_name': 'cellranger_multi',
      },
  )
  ## PIPELINE
  # The BAM copy step hangs off generate_cell_sorted_bam. The direct edge
  # convert_cellranger_bam_to_cram >> copy_bam_for_parallel_runs is disabled
  # (original note: "we need to load metrics to cram").
  generate_cell_sorted_bam >> copy_bam_for_parallel_runs
  convert_cellranger_bam_to_cram >> upload_cram_to_irods
  ## TASK
  # Run run_picard_for_cellranger with Picard command
  # 'CollectAlignmentSummaryMetrics' and Java option '-Xmx4g'.
  # Input comes from 'copy_bam_for_parallel_runs' under this task's own id;
  # output is published under key 'picard_alignment_summary'.
  run_picard_alignment_summary = PythonOperator(
      task_id='run_picard_alignment_summary',
      python_callable=run_picard_for_cellranger,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_alignment_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'java_param': '-Xmx4g',
          'picard_command': 'CollectAlignmentSummaryMetrics',
          'picard_option': {},
          'analysis_files_xcom_key': 'picard_alignment_summary',
          'bam_files_xcom_key': None,
      },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_alignment_summary
  ## TASK
  # Run clean_up_files on the input that 'copy_bam_for_parallel_runs'
  # published under key 'run_picard_alignment_summary'; scheduled after
  # the Picard task that reads that input.
  cleanup_picard_alignment_summary_input = PythonOperator(
      task_id='cleanup_picard_alignment_summary_input',
      python_callable=clean_up_files,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_alignment_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
      },
  )
  ## PIPELINE
  run_picard_alignment_summary >> cleanup_picard_alignment_summary_input
  ## TASK
  # Run upload_analysis_file_to_box on key 'picard_alignment_summary' of
  # task 'run_picard_alignment_summary', tagged
  # 'Picard-CollectAlignmentSummaryMetrics'.
  upload_picard_alignment_summary_to_box = PythonOperator(
      task_id='upload_picard_alignment_summary_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_picard_alignment_summary',
          'xcom_pull_files_key': 'picard_alignment_summary',
          'analysis_tag': 'Picard-CollectAlignmentSummaryMetrics',
      },
  )
  ## PIPELINE
  run_picard_alignment_summary >> upload_picard_alignment_summary_to_box
  ## TASK
  # Run run_picard_for_cellranger with Picard command
  # 'QualityScoreDistribution' and Java option '-Xmx4g'.
  # Input comes from 'copy_bam_for_parallel_runs' under this task's own id;
  # output is published under key 'picard_qual_summary'.
  run_picard_qual_summary = PythonOperator(
      task_id='run_picard_qual_summary',
      python_callable=run_picard_for_cellranger,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_qual_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'java_param': '-Xmx4g',
          'picard_command': 'QualityScoreDistribution',
          'picard_option': {},
          'analysis_files_xcom_key': 'picard_qual_summary',
          'bam_files_xcom_key': None,
      },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_qual_summary
  ## TASK
  # Run clean_up_files on the input that 'copy_bam_for_parallel_runs'
  # published under key 'run_picard_qual_summary'; scheduled after the
  # Picard task that reads that input.
  cleanup_picard_qual_summary_input = PythonOperator(
      task_id='cleanup_picard_qual_summary_input',
      python_callable=clean_up_files,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_qual_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
      },
  )
  ## PIPELINE
  run_picard_qual_summary >> cleanup_picard_qual_summary_input
  ## TASK
  # Run upload_analysis_file_to_box on key 'picard_qual_summary' of task
  # 'run_picard_qual_summary', tagged 'Picard-QualityScoreDistribution'.
  upload_picard_qual_summary_to_box = PythonOperator(
      task_id='upload_picard_qual_summary_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_picard_qual_summary',
          'xcom_pull_files_key': 'picard_qual_summary',
          'analysis_tag': 'Picard-QualityScoreDistribution',
      },
  )
  ## PIPELINE
  run_picard_qual_summary >> upload_picard_qual_summary_to_box
  ## TASK
  # Run run_picard_for_cellranger with Picard command 'CollectRnaSeqMetrics'.
  # Unlike the other Picard tasks here, this one is queued on 'hpc_8G' with
  # Java option '-Xmx7g'.
  # Output is published under key 'picard_rna_summary'.
  run_picard_rna_summary = PythonOperator(
      task_id='run_picard_rna_summary',
      python_callable=run_picard_for_cellranger,
      queue='hpc_8G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_rna_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'java_param': '-Xmx7g',
          'picard_command': 'CollectRnaSeqMetrics',
          'picard_option': {},
          'analysis_files_xcom_key': 'picard_rna_summary',
          'bam_files_xcom_key': None,
      },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_rna_summary
  ## TASK
  # Run clean_up_files on the input that 'copy_bam_for_parallel_runs'
  # published under key 'run_picard_rna_summary'; scheduled after the
  # Picard task that reads that input.
  cleanup_picard_rna_summary_input = PythonOperator(
      task_id='cleanup_picard_rna_summary_input',
      python_callable=clean_up_files,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_rna_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
      },
  )
  ## PIPELINE
  run_picard_rna_summary >> cleanup_picard_rna_summary_input
  ## TASK
  # Run upload_analysis_file_to_box on key 'picard_rna_summary' of task
  # 'run_picard_rna_summary', tagged 'Picard-CollectRnaSeqMetrics'.
  upload_picard_rna_summary_to_box = PythonOperator(
      task_id='upload_picard_rna_summary_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_picard_rna_summary',
          'xcom_pull_files_key': 'picard_rna_summary',
          'analysis_tag': 'Picard-CollectRnaSeqMetrics',
      },
  )
  ## PIPELINE
  run_picard_rna_summary >> upload_picard_rna_summary_to_box
  ## TASK
  # Run run_picard_for_cellranger with Picard command 'CollectGcBiasMetrics'
  # and Java option '-Xmx4g'.
  # Input comes from 'copy_bam_for_parallel_runs' under this task's own id;
  # output is published under key 'picard_gc_summary'.
  run_picard_gc_summary = PythonOperator(
      task_id='run_picard_gc_summary',
      python_callable=run_picard_for_cellranger,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_gc_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'java_param': '-Xmx4g',
          'picard_command': 'CollectGcBiasMetrics',
          'picard_option': {},
          'analysis_files_xcom_key': 'picard_gc_summary',
          'bam_files_xcom_key': None,
      },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_gc_summary
  ## TASK
  # Run clean_up_files on the input that 'copy_bam_for_parallel_runs'
  # published under key 'run_picard_gc_summary'; scheduled after the
  # Picard task that reads that input.
  cleanup_picard_gc_summary_input = PythonOperator(
      task_id='cleanup_picard_gc_summary_input',
      python_callable=clean_up_files,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_gc_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
      },
  )
  ## PIPELINE
  run_picard_gc_summary >> cleanup_picard_gc_summary_input
  ## TASK
  # Run upload_analysis_file_to_box on key 'picard_gc_summary' of task
  # 'run_picard_gc_summary', tagged 'Picard-CollectGcBiasMetrics'.
  upload_picard_gc_summary_to_box = PythonOperator(
      task_id='upload_picard_gc_summary_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_picard_gc_summary',
          'xcom_pull_files_key': 'picard_gc_summary',
          'analysis_tag': 'Picard-CollectGcBiasMetrics',
      },
  )
  ## PIPELINE
  run_picard_gc_summary >> upload_picard_gc_summary_to_box
  ## TASK
  # Run run_picard_for_cellranger with Picard command
  # 'CollectBaseDistributionByCycle' and Java option '-Xmx4g'.
  # Input comes from 'copy_bam_for_parallel_runs' under this task's own id;
  # output is published under key 'picard_base_summary'.
  run_picard_base_dist_summary = PythonOperator(
      task_id='run_picard_base_dist_summary',
      python_callable=run_picard_for_cellranger,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_base_dist_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'java_param': '-Xmx4g',
          'picard_command': 'CollectBaseDistributionByCycle',
          'picard_option': {},
          'analysis_files_xcom_key': 'picard_base_summary',
          'bam_files_xcom_key': None,
      },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_picard_base_dist_summary
  ## TASK
  # Run clean_up_files on the input that 'copy_bam_for_parallel_runs'
  # published under key 'run_picard_base_dist_summary'; scheduled after
  # the Picard task that reads that input.
  cleanup_picard_base_dist_summary_input = PythonOperator(
      task_id='cleanup_picard_base_dist_summary_input',
      python_callable=clean_up_files,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_picard_base_dist_summary',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
      },
  )
  ## PIPELINE
  run_picard_base_dist_summary >> cleanup_picard_base_dist_summary_input
  ## TASK
  # Run upload_analysis_file_to_box on key 'picard_base_summary' of task
  # 'run_picard_base_dist_summary', tagged
  # 'Picard-CollectBaseDistributionByCycle'.
  upload_picard_base_dist_summary_to_box = PythonOperator(
      task_id='upload_picard_base_dist_summary_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_picard_base_dist_summary',
          'xcom_pull_files_key': 'picard_base_summary',
          'analysis_tag': 'Picard-CollectBaseDistributionByCycle',
      },
  )
  ## PIPELINE
  run_picard_base_dist_summary >> upload_picard_base_dist_summary_to_box
  ## TASK
  # Run run_samtools_for_cellranger with samtools command 'stats' and
  # 'threads': 4, on the 'hpc_4G4t' queue.
  # Input comes from 'copy_bam_for_parallel_runs' under this task's own id;
  # output is published under key 'samtools_stats'.
  run_samtools_stats = PythonOperator(
      task_id='run_samtools_stats',
      python_callable=run_samtools_for_cellranger,
      queue='hpc_4G4t',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_samtools_stats',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'samtools_command': 'stats',
          'threads': 4,
          'analysis_files_xcom_key': 'samtools_stats',
      },
  )
  ## PIPELINE
  copy_bam_for_parallel_runs >> run_samtools_stats
  ## TASK
  # Run upload_analysis_file_to_box on key 'samtools_stats' of task
  # 'run_samtools_stats', tagged 'Samtools-stats'.
  upload_samtools_stats_to_box = PythonOperator(
      task_id='upload_samtools_stats_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_samtools_stats',
          'xcom_pull_files_key': 'samtools_stats',
          'analysis_tag': 'Samtools-stats',
      },
  )
  ## PIPELINE
  run_samtools_stats >> upload_samtools_stats_to_box
  ## TASK
  # Run run_samtools_for_cellranger with samtools command 'idxstats'.
  # It reads the same input key as run_samtools_stats ('run_samtools_stats'):
  # the task list of copy_bam_for_parallel_runs has no separate idxstats
  # entry. Hence it is chained after run_samtools_stats rather than after
  # the copy step.
  # Output is published under key 'samtools_idxstats'.
  run_samtools_idxstats = PythonOperator(
      task_id='run_samtools_idxstats',
      python_callable=run_samtools_for_cellranger,
      queue='hpc_4G4t',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_samtools_stats',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'load_metrics_to_cram': True,
          'samtools_command': 'idxstats',
          'threads': 4,
          'analysis_files_xcom_key': 'samtools_idxstats',
      },
  )
  ## PIPELINE
  run_samtools_stats >> run_samtools_idxstats
  ## TASK
  # Run clean_up_files on the input that 'copy_bam_for_parallel_runs'
  # published under key 'run_samtools_stats'.
  # Scheduled after run_samtools_idxstats, which reads the same input key
  # as run_samtools_stats.
  cleanup_samtools_stats_input = PythonOperator(
      task_id='cleanup_samtools_stats_input',
      python_callable=clean_up_files,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_files_key': 'run_samtools_stats',
          'xcom_pull_task': 'copy_bam_for_parallel_runs',
      },
  )
  ## PIPELINE
  run_samtools_idxstats >> cleanup_samtools_stats_input
  ## TASK
  # Run upload_analysis_file_to_box on key 'samtools_idxstats' of task
  # 'run_samtools_idxstats', tagged 'Samtools-idxstats'.
  upload_samtools_idxstats_to_box = PythonOperator(
      task_id='upload_samtools_idxstats_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'run_samtools_idxstats',
          'xcom_pull_files_key': 'samtools_idxstats',
          'analysis_tag': 'Samtools-idxstats',
      },
  )
  ## PIPELINE
  run_samtools_idxstats >> upload_samtools_idxstats_to_box
  ## TASK
  # Run run_multiqc_for_cellranger over the outputs of the cellranger,
  # Picard and samtools tasks. 'list_of_analysis_xcoms_and_tasks' maps each
  # producing task id to the key under which it published its output.
  # The HTML report and data file are published under keys 'multiqc_html'
  # and 'multiqc_data'.
  # trigger_rule 'none_failed_or_skipped' is used because the upstream QC
  # tasks sit behind the branching task copy_bam_for_parallel_runs.
  # NOTE(review): Airflow 2.2+ renames this rule 'none_failed_min_one_success'.
  run_multiqc = PythonOperator(
      task_id='run_multiqc',
      python_callable=run_multiqc_for_cellranger,
      trigger_rule='none_failed_or_skipped',
      queue='hpc_4G',
      dag=dag,
      params={
          'list_of_analysis_xcoms_and_tasks': {
              'run_cellranger': 'cellranger_output',
              'run_picard_alignment_summary': 'picard_alignment_summary',
              'run_picard_qual_summary': 'picard_qual_summary',
              'run_picard_rna_summary': 'picard_rna_summary',
              'run_picard_gc_summary': 'picard_gc_summary',
              'run_picard_base_dist_summary': 'picard_base_summary',
              'run_samtools_stats': 'samtools_stats',
              'run_samtools_idxstats': 'samtools_idxstats',
          },
          'analysis_description_xcom_pull_task': 'fetch_analysis_info',
          'analysis_description_xcom_key': 'analysis_description',
          'use_ephemeral_space': True,
          'multiqc_html_file_xcom_key': 'multiqc_html',
          'multiqc_data_file_xcom_key': 'multiqc_data',
          # Fixed typo: was 'picad'. MultiQC's module is named 'picard', so
          # the misspelt entry could not refer to the Picard module.
          'tool_order_list': ['picard', 'samtools'],
      },
  )
  ## PIPELINE
  # run_samtools_stats is covered transitively through run_samtools_idxstats.
  [
      run_picard_alignment_summary,
      run_picard_qual_summary,
      run_picard_rna_summary,
      run_picard_gc_summary,
      run_picard_base_dist_summary,
      run_samtools_idxstats,
  ] >> run_multiqc
  ## TASK
  # Run load_analysis_files_func for the MultiQC HTML report.
  # Input file: key 'multiqc_html' of task 'run_multiqc'.
  # Registered with collection type 'MULTIQC_HTML' (table 'sample'); the
  # result is exposed under key 'output_db_files' for the upload tasks.
  load_multiqc_html = PythonOperator(
      task_id='load_multiqc_html',
      python_callable=load_analysis_files_func,
      queue='hpc_4G',
      dag=dag,
      params={
          'collection_name_task': 'load_cellranger_result_to_db',
          'collection_name_key': 'sample_igf_id',
          'file_name_task': 'run_multiqc',
          'file_name_key': 'multiqc_html',
          'analysis_name': 'multiqc',
          'collection_type': 'MULTIQC_HTML',
          'collection_table': 'sample',
          'output_files_key': 'output_db_files',
      },
  )
  ## PIPELINE
  run_multiqc >> load_multiqc_html
  ## TASK
  # Run ftp_files_upload_for_analysis on the files published by
  # 'load_multiqc_html' (key 'output_db_files').
  # The remote copy is collected ('collect_remote_file': True) under
  # collection type 'FTP_MULTIQC_HTML' in table 'sample'.
  upload_multiqc_to_ftp = PythonOperator(
      task_id='upload_multiqc_to_ftp',
      python_callable=ftp_files_upload_for_analysis,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'load_multiqc_html',
          'xcom_pull_files_key': 'output_db_files',
          'collection_name_task': 'load_cellranger_result_to_db',
          'collection_name_key': 'sample_igf_id',
          'collection_type': 'FTP_MULTIQC_HTML',
          'collection_table': 'sample',
          'collect_remote_file': True,
      },
  )
  ## PIPELINE
  load_multiqc_html >> upload_multiqc_to_ftp
  ## TASK
  # Run upload_analysis_file_to_box on the files published by
  # 'load_multiqc_html' (key 'output_db_files'), tagged 'multiqc_report'.
  upload_multiqc_to_box = PythonOperator(
      task_id='upload_multiqc_to_box',
      python_callable=upload_analysis_file_to_box,
      queue='hpc_4G',
      dag=dag,
      params={
          'xcom_pull_task': 'load_multiqc_html',
          'xcom_pull_files_key': 'output_db_files',
          'analysis_tag': 'multiqc_report',
      },
  )
  ## PIPELINE
  load_multiqc_html >> upload_multiqc_to_box
  ## TASK
  # Run change_pipeline_status with 'new_status': 'FINISHED' and
  # 'no_change_status': 'SEEDED', once the upload tasks listed below are done.
  # trigger_rule 'none_failed_or_skipped' lets it run when some upstream
  # branches were skipped.
  update_analysis_and_status = PythonOperator(
      task_id='update_analysis_and_status',
      python_callable=change_pipeline_status,
      trigger_rule='none_failed_or_skipped',
      queue='hpc_4G',
      dag=dag,
      params={
          'new_status': 'FINISHED',
          'no_change_status': 'SEEDED',
      },
  )
  ## PIPELINE
  # The ambiguous-VDJ scirpy uploads (upload_scirpy_report_for_vdj_to_ftp /
  # upload_scirpy_report_for_vdj_to_box) are no longer wired in
  # ("no more ambiguous VDJ").
  # NOTE(review): upload_loom_file_to_irods, upload_multiqc_to_box and the
  # Picard/samtools Box uploads are not upstream of this task - confirm that
  # is intended.
  [
      upload_multiqc_to_ftp,
      upload_scanpy_report_for_sc_5p_to_ftp,
      upload_scanpy_report_for_sc_5p_to_box,
      upload_cellbrowser_for_sc_5p_to_ftp,
      upload_scirpy_report_for_vdj_b_to_ftp,
      upload_scirpy_report_for_vdj_b_to_box,
      upload_scirpy_report_for_vdj_t_to_ftp,
      upload_scirpy_report_for_vdj_t_to_box,
      upload_seurat_report_for_sc_5p_ftp,
      upload_seurat_report_for_sc_5p_to_box,
      upload_cellranger_results_to_irods,
      upload_cellranger_report_to_ftp,
      upload_cellranger_report_to_box,
      upload_cram_to_irods,
      upload_scvelo_report_to_box,
      upload_scvelo_report_to_ftp,
  ] >> update_analysis_and_status
  ## TASK
  # Final task: run create_and_update_qc_pages for the FTP collection types
  # listed in 'collection_type_list', after the status update.
  # NOTE(review): 'FTP_SCIRPY_VDJ_HTML' is still listed although the
  # ambiguous-VDJ upload tasks are disabled upstream - confirm it is needed.
  update_qc_pages = PythonOperator(
      task_id='update_qc_pages',
      python_callable=create_and_update_qc_pages,
      queue='hpc_4G',
      dag=dag,
      params={
          'use_ephemeral_space': True,
          'collection_type_list': [
              'FTP_MULTIQC_HTML',
              'FTP_SEURAT_HTML',
              'FTP_SCIRPY_VDJ_T_HTML',
              'FTP_SCIRPY_VDJ_B_HTML',
              'FTP_SCIRPY_VDJ_HTML',
              'FTP_CELLBROWSER',
              'FTP_SCANPY_HTML',
              'FTP_SCVELO_HTML',
              'FTP_CELLRANGER_HTML',
          ],
      },
  )
  ## PIPELINE
  update_analysis_and_status >> update_qc_pages