From 5efb4de2090467c5a9a7f1fbcb177d686f158d50 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 14 Apr 2024 15:03:35 -0400 Subject: [PATCH 01/18] initial commit for bedtools --- docs/guide-deeptools.md | 52 +++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 53 insertions(+) create mode 100644 docs/guide-deeptools.md diff --git a/docs/guide-deeptools.md b/docs/guide-deeptools.md new file mode 100644 index 00000000..74626150 --- /dev/null +++ b/docs/guide-deeptools.md @@ -0,0 +1,52 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.11.3 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +# How do I + +## `bedtools intersect` + +### Original entries from the first bed + +```sh +bedtools intersect -wa -a A.bed -b B.bed > out.bed +``` + +```py +overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) +out = A.loc[overlap['index_1']] +``` + +### Intersect with multiple beds + +```sh +bedtools intersect -wa -a A.bed -b B.bed C.bed D.bed> out.bed +``` + +```py +others = pd.concat([B, C, D]) +overlap = bf.overlap(A, others, how='inner', suffixes=('_1','_2'), return_index=True) +out = A.loc[overlap['index_1']] +``` + +### Keep no overlap + +```sh +bedtools intersect -wa -a A.bed -b B.bed -v > out.bed +``` + +```py +overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) +out = A.loc[~A.index.isin(set(overlap['index_1'].unique()))] +``` + diff --git a/docs/index.rst b/docs/index.rst index ebb61ad9..ba473692 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,7 @@ bioframe guide-recipes.md guide-definitions guide-specifications + guide-bedtools .. 
toctree:: :maxdepth: 1 From 35385eedd42fc4eb1779a87c7de74f842d599dab Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 14 Apr 2024 15:06:31 -0400 Subject: [PATCH 02/18] add entries from second bed --- docs/guide-deeptools.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/guide-deeptools.md b/docs/guide-deeptools.md index 74626150..0cf60eb1 100644 --- a/docs/guide-deeptools.md +++ b/docs/guide-deeptools.md @@ -27,6 +27,17 @@ overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] ``` +### Original entries from the second bed + +```sh +bedtools intersect -wb -a A.bed -b B.bed > out.bed +``` + +```py +overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) +out = B.loc[overlap['index_2']] +``` + ### Intersect with multiple beds ```sh From 75959aa79d4a629826844a07938655e992abd0ce Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 14 Apr 2024 15:07:20 -0400 Subject: [PATCH 03/18] add bedtools flags --- docs/guide-deeptools.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/guide-deeptools.md b/docs/guide-deeptools.md index 0cf60eb1..feee6da0 100644 --- a/docs/guide-deeptools.md +++ b/docs/guide-deeptools.md @@ -16,7 +16,7 @@ kernelspec: ## `bedtools intersect` -### Original entries from the first bed +### Original entries from the first bed `-wa` ```sh bedtools intersect -wa -a A.bed -b B.bed > out.bed @@ -27,7 +27,7 @@ overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] ``` -### Original entries from the second bed +### Original entries from the second bed `-wb` ```sh bedtools intersect -wb -a A.bed -b B.bed > out.bed @@ -50,7 +50,7 @@ overlap = bf.overlap(A, others, how='inner', suffixes=('_1','_2'), return_index= out = A.loc[overlap['index_1']] ``` -### Keep no overlap +### Keep no overlap `-v` ```sh bedtools intersect -wa -a A.bed -b B.bed -v > out.bed From 
2b2980c5d80fa201c45f67c442c0007503967d0c Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 14 Apr 2024 15:10:33 -0400 Subject: [PATCH 04/18] typo in tools name --- docs/{guide-deeptools.md => guide-bedtools.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/{guide-deeptools.md => guide-bedtools.md} (100%) diff --git a/docs/guide-deeptools.md b/docs/guide-bedtools.md similarity index 100% rename from docs/guide-deeptools.md rename to docs/guide-bedtools.md From 16036911b56f9ffdc8c63a1d2d8b4b3de991367f Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Mon, 15 Apr 2024 11:54:21 -0400 Subject: [PATCH 05/18] use setdiff --- docs/guide-bedtools.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index feee6da0..5b3de227 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -57,7 +57,6 @@ bedtools intersect -wa -a A.bed -b B.bed -v > out.bed ``` ```py -overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) -out = A.loc[~A.index.isin(set(overlap['index_1'].unique()))] +out = bf.setdiff(A, B) ``` From fb35d09c8125799194978988f05c4bc2d52258cd Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Mon, 15 Apr 2024 11:55:30 -0400 Subject: [PATCH 06/18] change page name --- docs/guide-bedtools.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 5b3de227..a90ec2d2 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -12,7 +12,7 @@ kernelspec: name: python3 --- -# How do I +# Emulating bedtools commands ## `bedtools intersect` From 7a0126fb965b46b28ddc36d5b9374fc87a148629 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Tue, 16 Apr 2024 16:57:01 -0400 Subject: [PATCH 07/18] add -f and -s --- docs/guide-bedtools.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 
a90ec2d2..dbdd56c5 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -14,6 +14,10 @@ kernelspec: # Emulating bedtools commands +If you want to work on `gtf` files, you do not need to turn them into bed files, +you can directly read them (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)) +and turn them into bedframe by renaming the `seqname` column into `chrom`. + ## `bedtools intersect` ### Original entries from the first bed `-wa` @@ -60,3 +64,38 @@ bedtools intersect -wa -a A.bed -b B.bed -v > out.bed out = bf.setdiff(A, B) ``` +### Force strandedness `-s` + +For intersection + +```sh +bedtools intersect -wa -a A.bed -b B.bed -s > out.bed +``` + +```py +overlap = bf.overlap(A, B, on=['strand'], suffixes=('_1','_2'), return_index=True) +out = A.loc[overlap['index_1']] +``` + +For non intersection + +```sh +bedtools intersect -wa -a A.bed -b B.bed -v -s > out.bed +``` + +```py +out = bf.setdiff(A, B, on=['strand']) +``` + +### Minimum overlap a as fraction of A `-f` + +We want to keep rows of A that are covered at least 70% by elements from B + +```sh +bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed +``` + +```py +cov = bf.coverage(A, B) +out = A[cov['coverage'] / (cov['end']-cov['start']) ) >=0.70] +``` From fd7fccdf369f1d5ac9a5c7095a0284e953c6e356 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Tue, 16 Apr 2024 18:56:01 -0400 Subject: [PATCH 08/18] change title + mistake --- docs/guide-bedtools.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index dbdd56c5..aa2ea26c 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -12,7 +12,7 @@ kernelspec: name: python3 --- -# Emulating bedtools commands +# Bioframe for bedtools users If you want to work on `gtf` files, you do not need to turn them into bed files, you can directly read them (with e.g. 
[gtfparse](https://github.com/openvax/gtfparse/tree/master)) @@ -73,7 +73,7 @@ bedtools intersect -wa -a A.bed -b B.bed -s > out.bed ``` ```py -overlap = bf.overlap(A, B, on=['strand'], suffixes=('_1','_2'), return_index=True) +overlap = bf.overlap(A, B, on=['strand'], suffixes=('_1','_2'), return_index=True, how='inner') out = A.loc[overlap['index_1']] ``` From 7c4147fd3428758ffb4632e79b310e4319a4197a Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 17 Apr 2024 12:00:14 -0400 Subject: [PATCH 09/18] Update docs/guide-bedtools.md --- docs/guide-bedtools.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index aa2ea26c..a75c598f 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -97,5 +97,5 @@ bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed ```py cov = bf.coverage(A, B) -out = A[cov['coverage'] / (cov['end']-cov['start']) ) >=0.70] +out = A[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] ``` From f23926359a99d05eccf00270fe966a0488dc4a3d Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Tue, 23 Apr 2024 16:42:53 -0400 Subject: [PATCH 10/18] add --- docs/guide-bedtools.md | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index a75c598f..d3f6e726 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -20,7 +20,22 @@ and turn them into bedframe by renaming the `seqname` column into `chrom`. 
## `bedtools intersect` -### Original entries from the first bed `-wa` +### Original unique entries from the first bed `-u` + +Note that this gives one row per overlap and can contain duplicates, + +```sh +bedtools intersect -u -a A.bed -b B.bed > out.bed +``` + +```py +overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) +out = A.loc[overlap['index_1'].unique()] +``` + +### Original entries from the first bed for each overlap`-wa` + +Note that this gives one row per overlap and can contain duplicates, ```sh bedtools intersect -wa -a A.bed -b B.bed > out.bed @@ -87,7 +102,7 @@ bedtools intersect -wa -a A.bed -b B.bed -v -s > out.bed out = bf.setdiff(A, B, on=['strand']) ``` -### Minimum overlap a as fraction of A `-f` +### Minimum overlap as a fraction of A `-f` We want to keep rows of A that are covered at least 70% by elements from B @@ -97,5 +112,5 @@ bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed ```py cov = bf.coverage(A, B) -out = A[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] +out = A.loc[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] ``` From cda26129666cd0803d623f3cbfa7aa7d6d95c231 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Wed, 24 Apr 2024 00:25:12 -0400 Subject: [PATCH 11/18] add more file formats and -c --- docs/guide-bedtools.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index d3f6e726..1b07154a 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -14,10 +14,17 @@ kernelspec: # Bioframe for bedtools users -If you want to work on `gtf` files, you do not need to turn them into bed files, -you can directly read them (with e.g. 
[gtfparse](https://github.com/openvax/gtfparse/tree/master)) +If you work with bed files you can simply load them using `read_table`, it will +create a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) +which supports all the bioframe operations. + +Altertantively if you want to work on `gtf` files, you do not need to turn them +into bed files, you can directly read them (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)) and turn them into bedframe by renaming the `seqname` column into `chrom`. +Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support +all the following operations TODO `API_fileops` + ## `bedtools intersect` ### Original unique entries from the first bed `-u` @@ -33,6 +40,18 @@ overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1'].unique()] ``` +### Report the number of hits in B `-c` + +Reports 0 for A entries that have no overlap with B. + +```sh +bedtools intersect -c -a A.bed -b B.bed > out.bed +``` + +```py +out = bf.count_overlaps(A, B) +``` + ### Original entries from the first bed for each overlap`-wa` Note that this gives one row per overlap and can contain duplicates, @@ -114,3 +133,4 @@ bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed cov = bf.coverage(A, B) out = A.loc[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] ``` + From 2713ac2f8966a4457810eacfdfa95cbcb276f9a9 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Wed, 24 Apr 2024 00:33:11 -0400 Subject: [PATCH 12/18] add to_bed --- docs/guide-bedtools.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 1b07154a..8b4d2ff2 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -25,6 +25,9 @@ and turn them into bedframe by renaming the `seqname` column into `chrom`. 
Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support all the following operations TODO `API_fileops` +You can write the output of your operations back to a bed file using `to_bed` which will +generally be able to infer the bed format used. + ## `bedtools intersect` ### Original unique entries from the first bed `-u` @@ -133,4 +136,3 @@ bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed cov = bf.coverage(A, B) out = A.loc[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] ``` - From f1f5238e57fc4c2bf83116a262c1166b3c5e2c62 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Fri, 26 Apr 2024 00:45:00 -0400 Subject: [PATCH 13/18] Add alternative implementations --- docs/api-fileops.rst | 2 ++ docs/guide-bedtools.md | 27 +++++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/docs/api-fileops.rst b/docs/api-fileops.rst index 4f55bc30..9acb04b4 100644 --- a/docs/api-fileops.rst +++ b/docs/api-fileops.rst @@ -1,3 +1,5 @@ +.. _API_fileops: + File I/O ======== diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 8b4d2ff2..8e755157 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -14,19 +14,18 @@ kernelspec: # Bioframe for bedtools users -If you work with bed files you can simply load them using `read_table`, it will -create a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) -which supports all the bioframe operations. -Altertantively if you want to work on `gtf` files, you do not need to turn them -into bed files, you can directly read them (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)) -and turn them into bedframe by renaming the `seqname` column into `chrom`. +bioframe is built around the analysis of genomic intervals as a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) in memory, rather than working with tab-delimited text files saved on disk. 
-Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support -all the following operations TODO `API_fileops` +Bioframe supports reading a number of standard genomics text file formats via [`read_table`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.fileops.read_table), including BED files (see [schemas](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py)), which will load them as pandas DataFrames, a complete list of helper functions is [available here](API_fileops). + +For example, with gtf files, you do not need to turn them into bed files, you can directly read them into pandas (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)). +For gtfs, it is often convenient to rename the seqname column into chrom, the default column name used in bioframe. + +Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support the genomic [interval operations in bioframe](API_ops). + +Finally, if needed, bioframe provides a convenience function to write the back to a bed file using `to_bed`. -You can write the output of your operations back to a bed file using `to_bed` which will -generally be able to infer the bed format used. 
## `bedtools intersect` @@ -64,8 +63,10 @@ bedtools intersect -wa -a A.bed -b B.bed > out.bed ``` ```py -overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) +overlap = bf.overlap(A, B, suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] +# Alternatively +out = A.loc[bioframe.ops._overlap_intidxs(A, B, how='inner')[:,0]] ``` ### Original entries from the second bed `-wb` @@ -87,7 +88,7 @@ bedtools intersect -wa -a A.bed -b B.bed C.bed D.bed> out.bed ```py others = pd.concat([B, C, D]) -overlap = bf.overlap(A, others, how='inner', suffixes=('_1','_2'), return_index=True) +overlap = bf.overlap(A, others, suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] ``` @@ -135,4 +136,6 @@ bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed ```py cov = bf.coverage(A, B) out = A.loc[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] +# alternatively +out = bf.coverage(A, B).query('coverage / (end - start) >= 0.7')[A.columns] ``` From 2c2647b527e16eb0eb5c3a02938b6a176bd148b4 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Fri, 26 Apr 2024 00:48:18 -0400 Subject: [PATCH 14/18] forgot the inner --- docs/guide-bedtools.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 8e755157..9b2eab17 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -63,7 +63,7 @@ bedtools intersect -wa -a A.bed -b B.bed > out.bed ``` ```py -overlap = bf.overlap(A, B, suffixes=('_1','_2'), return_index=True) +overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] # Alternatively out = A.loc[bioframe.ops._overlap_intidxs(A, B, how='inner')[:,0]] @@ -88,7 +88,7 @@ bedtools intersect -wa -a A.bed -b B.bed C.bed D.bed> out.bed ```py others = pd.concat([B, C, D]) -overlap = bf.overlap(A, others, suffixes=('_1','_2'), return_index=True) +overlap = bf.overlap(A, others, how='inner', 
suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] ``` From 600bdafa79532a164dae2cfd9fe15ea9159da8fc Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 26 Apr 2024 06:09:36 -0400 Subject: [PATCH 15/18] Remove intidxs and describe differences in indexes in output --- docs/guide-bedtools.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 9b2eab17..658b8fde 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -66,9 +66,9 @@ bedtools intersect -wa -a A.bed -b B.bed > out.bed overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] # Alternatively -out = A.loc[bioframe.ops._overlap_intidxs(A, B, how='inner')[:,0]] +out = bf.overlap(A, B, how='inner')[A.columns] ``` - +**Note:** The output dataframe of the former method will use the same pandas index as the input dataframe `A`, while the latter result (the join output) will have an integer range index. 
### Original entries from the second bed `-wb` ```sh @@ -78,6 +78,9 @@ bedtools intersect -wb -a A.bed -b B.bed > out.bed ```py overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = B.loc[overlap['index_2']] + +# Alternatively +out = bf.overlap(A, B, how='inner', suffixes=("_", ""))[B.columns] ``` ### Intersect with multiple beds From 1bc4fe24e8b375125870c3feaeeae2dcdaff6e16 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 26 Apr 2024 06:10:05 -0400 Subject: [PATCH 16/18] Update docs/guide-bedtools.md --- docs/guide-bedtools.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 658b8fde..314bc332 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -69,6 +69,7 @@ out = A.loc[overlap['index_1']] out = bf.overlap(A, B, how='inner')[A.columns] ``` **Note:** The output dataframe of the former method will use the same pandas index as the input dataframe `A`, while the latter result (the join output) will have an integer range index. + ### Original entries from the second bed `-wb` ```sh From d4702b2b94e89802de512ee8cb36225ba8ae0e23 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 26 Apr 2024 06:40:38 -0400 Subject: [PATCH 17/18] Notes about indexes --- docs/api-fileops.rst | 2 ++ docs/guide-bedtools.md | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/docs/api-fileops.rst b/docs/api-fileops.rst index 9acb04b4..6b5aadfc 100644 --- a/docs/api-fileops.rst +++ b/docs/api-fileops.rst @@ -6,3 +6,5 @@ File I/O .. automodule:: bioframe.io.fileops :autosummary: :members: + +.. 
autofunction:: bioframe.io.bed.to_bed \ No newline at end of file diff --git a/docs/guide-bedtools.md b/docs/guide-bedtools.md index 314bc332..a35905f1 100644 --- a/docs/guide-bedtools.md +++ b/docs/guide-bedtools.md @@ -15,24 +15,21 @@ kernelspec: # Bioframe for bedtools users -bioframe is built around the analysis of genomic intervals as a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) in memory, rather than working with tab-delimited text files saved on disk. +Bioframe is built around the analysis of genomic intervals as a pandas [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) in memory, rather than working with tab-delimited text files saved on disk. Bioframe supports reading a number of standard genomics text file formats via [`read_table`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.fileops.read_table), including BED files (see [schemas](https://github.com/open2c/bioframe/blob/main/bioframe/io/schemas.py)), which will load them as pandas DataFrames, a complete list of helper functions is [available here](API_fileops). -For example, with gtf files, you do not need to turn them into bed files, you can directly read them into pandas (with e.g. [gtfparse](https://github.com/openvax/gtfparse/tree/master)). -For gtfs, it is often convenient to rename the seqname column into chrom, the default column name used in bioframe. +Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support the genomic [interval operations in bioframe](API_ops). The names of these columns can also be customized via the `cols=` arguments in bioframe functions. -Any DataFrame object with `'chrom'`, `'start'`, and `'end'` columns will support the genomic [interval operations in bioframe](API_ops). +For example, with gtf files, you do not need to turn them into bed files, you can directly read them into pandas (with e.g. 
[gtfparse](https://github.com/openvax/gtfparse/tree/master)). For gtfs, it is often convenient to rename the `'seqname'` column to `'chrom'`, the default column name used in bioframe. -Finally, if needed, bioframe provides a convenience function to write the back to a bed file using `to_bed`. +Finally, if needed, bioframe provides a convenience function to write dataframes to a standard BED file using [`to_bed`](https://bioframe.readthedocs.io/en/latest/api-fileops.html#bioframe.io.bed.to_bed). ## `bedtools intersect` ### Original unique entries from the first bed `-u` -Note that this gives one row per overlap and can contain duplicates, - ```sh bedtools intersect -u -a A.bed -b B.bed > out.bed ``` @@ -56,8 +53,6 @@ out = bf.count_overlaps(A, B) -### Original entries from the first bed for each overlap`-wa` +### Original entries from the first bed for each overlap `-wa` -Note that this gives one row per overlap and can contain duplicates, - ```sh bedtools intersect -wa -a A.bed -b B.bed > out.bed ``` @@ -65,10 +60,12 @@ bedtools intersect -wa -a A.bed -b B.bed > out.bed ```py overlap = bf.overlap(A, B, how='inner', suffixes=('_1','_2'), return_index=True) out = A.loc[overlap['index_1']] + # Alternatively out = bf.overlap(A, B, how='inner')[A.columns] ``` -**Note:** The output dataframe of the former method will use the same pandas index as the input dataframe `A`, while the latter result (the join output) will have an integer range index. + +> **Note:** This gives one row per overlap and can contain duplicates. The output dataframe of the former method will use the same pandas index as the input dataframe `A`, while the latter result --- the join output --- will have an integer range index, like a pandas merge. ### Original entries from the second bed `-wb` @@ -84,6 +81,8 @@ out = B.loc[overlap['index_2']] out = bf.overlap(A, B, how='inner', suffixes=("_", ""))[B.columns] ``` +> **Note:** This gives one row per overlap and can contain duplicates. 
The output dataframe of the former method will use the same pandas index as the input dataframe `B`, while the latter result --- the join output --- will have an integer range index, like a pandas merge. + ### Intersect with multiple beds ```sh @@ -119,7 +118,7 @@ overlap = bf.overlap(A, B, on=['strand'], suffixes=('_1','_2'), return_index=Tru out = A.loc[overlap['index_1']] ``` -For non intersection +For non-intersection `-v` ```sh bedtools intersect -wa -a A.bed -b B.bed -v -s > out.bed @@ -140,6 +139,7 @@ bedtools intersect -wa -a A.bed -b B.bed -f 0.7 > out.bed ```py cov = bf.coverage(A, B) -out = A.loc[cov['coverage'] / (cov['end'] - cov['start']) ) >= 0.70] +out = A.loc[(cov['coverage'] / (cov['end'] - cov['start'])) >= 0.70] -# alternatively + +# Alternatively out = bf.coverage(A, B).query('coverage / (end - start) >= 0.7')[A.columns] ``` From 122f4367be4d46f6ac16321496ccba2f48510138 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:40:53 +0000 Subject: [PATCH 18/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/api-fileops.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api-fileops.rst b/docs/api-fileops.rst index 6b5aadfc..4a498f96 100644 --- a/docs/api-fileops.rst +++ b/docs/api-fileops.rst @@ -7,4 +7,4 @@ File I/O :autosummary: :members: -.. autofunction:: bioframe.io.bed.to_bed \ No newline at end of file +.. autofunction:: bioframe.io.bed.to_bed