bernardo-de-almeida commited on
Commit
a82ff3a
·
1 Parent(s): 42f0385

feat: add functional tracks pipeline notebook

Browse files
.gitattributes CHANGED
@@ -36,3 +36,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  assets/paper_summary.jpg filter=lfs diff=lfs merge=lfs -text
37
  assets/paper_summary.png filter=lfs diff=lfs merge=lfs -text
38
  assets/output_tracks.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
36
  assets/paper_summary.jpg filter=lfs diff=lfs merge=lfs -text
37
  assets/paper_summary.png filter=lfs diff=lfs merge=lfs -text
38
  assets/output_tracks.png filter=lfs diff=lfs merge=lfs -text
39
+ notebooks_pipelines/bigwig_outputs/K562_DNAse.bw filter=lfs diff=lfs merge=lfs -text
40
+ notebooks_pipelines/bigwig_outputs/K562_H3k4me3.bw filter=lfs diff=lfs merge=lfs -text
41
+ notebooks_pipelines/bigwig_outputs/K562_RNA_seq.bw filter=lfs diff=lfs merge=lfs -text
42
+ notebooks_pipelines/bigwig_outputs/HepG2_CTCF.bw filter=lfs diff=lfs merge=lfs -text
43
+ notebooks_pipelines/bigwig_outputs/HepG2_DNAse.bw filter=lfs diff=lfs merge=lfs -text
44
+ notebooks_pipelines/bigwig_outputs/HepG2_H3k4me3.bw filter=lfs diff=lfs merge=lfs -text
45
+ notebooks_pipelines/bigwig_outputs/HepG2_RNA_seq.bw filter=lfs diff=lfs merge=lfs -text
46
+ notebooks_pipelines/bigwig_outputs/K562_CTCF.bw filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -3,7 +3,7 @@ title: NTv3 — Foundation Models for Long-Range Genomics
3
  emoji: 🧬
4
  colorFrom: indigo
5
  colorTo: blue
6
- sdk: gradio
7
  pinned: false
8
  ---
9
 
 
3
  emoji: 🧬
4
  colorFrom: indigo
5
  colorTo: blue
6
+ sdk: static
7
  pinned: false
8
  ---
9
 
app.py DELETED
@@ -1,11 +0,0 @@
1
- """
2
- Main Gradio app entry point for NTv3 Space.
3
- This file imports the track prediction demo from app_tracks.py.
4
- """
5
- from app_tracks import demo_interface
6
-
7
- # For Hugging Face Spaces with Gradio SDK, the 'demo' variable must be named 'demo'
8
- demo = demo_interface
9
-
10
- if __name__ == "__main__":
11
- demo.launch(server_name="0.0.0.0", share=False)
 
 
 
 
 
 
 
 
 
 
 
 
app_tracks.py DELETED
@@ -1,158 +0,0 @@
1
- """
2
- Gradio app for NTv3 track prediction demo.
3
- This module contains the interactive track prediction interface.
4
- """
5
- import gradio as gr
6
- import torch
7
- from transformers import pipeline
8
- import os
9
-
10
- # Initialize the pipeline (will be loaded on first use)
11
- ntv3_tracks = None
12
-
13
- def load_pipeline():
14
- """Load the pipeline on first use (lazy loading)."""
15
- global ntv3_tracks
16
- if ntv3_tracks is None:
17
- model_name = "InstaDeepAI/NTv3_650M_pos"
18
- ntv3_tracks = pipeline(
19
- "ntv3-tracks",
20
- model=model_name,
21
- trust_remote_code=True,
22
- device=0 if torch.cuda.is_available() else -1,
23
- )
24
- return ntv3_tracks
25
-
26
- def predict_tracks(chrom, start, end, species):
27
- """Run track prediction on the specified genomic region."""
28
- try:
29
- # Validate inputs
30
- if not chrom or not start or not end or not species:
31
- return "❌ Please fill in all fields."
32
-
33
- start = int(start)
34
- end = int(end)
35
-
36
- if start >= end:
37
- return "❌ Start position must be less than end position."
38
-
39
- if end - start > 1_000_000:
40
- return "❌ Region size cannot exceed 1 Mb (1,000,000 bp)."
41
-
42
- # Load pipeline
43
- pipe = load_pipeline()
44
-
45
- # Run prediction
46
- out = pipe({
47
- "chrom": chrom,
48
- "start": start,
49
- "end": end,
50
- "species": species.lower()
51
- })
52
-
53
- # Format output
54
- result = f"""✅ Prediction completed successfully!
55
-
56
- 📊 Output Shapes:
57
- • BigWig tracks logits: {tuple(out.bigwig_tracks_logits.shape)}
58
- → {out.bigwig_tracks_logits.shape[1]} functional tracks over the center region
59
-
60
- • BED tracks logits: {tuple(out.bed_tracks_logits.shape)}
61
- → {out.bed_tracks_logits.shape[1]} genomic elements over the center region
62
-
63
- • Language model logits: {tuple(out.mlm_logits.shape)}
64
- → MLM predictions for the entire sequence
65
-
66
- 📝 Note: Predictions are made over 37.5% of the center region of the input sequence.
67
- """
68
- return result
69
-
70
- except Exception as e:
71
- return f"❌ Error: {str(e)}"
72
-
73
- # Create the track prediction demo interface (embedded in HTML)
74
- def create_demo_interface():
75
- """Create the Gradio interface for track prediction."""
76
- with gr.Blocks(title="NTv3 Track Prediction Demo", theme=gr.themes.Soft()) as demo_interface:
77
- gr.Markdown("""
78
- # 🧬 NTv3 Interactive Track Prediction Demo
79
-
80
- This demo allows you to run the NTv3 650M post-trained model to predict functional tracks and genomic elements for any genomic region.
81
-
82
- **Model:** `InstaDeepAI/NTv3_650M_pos`
83
- """)
84
-
85
- with gr.Row():
86
- with gr.Column():
87
- chrom = gr.Textbox(
88
- label="Chromosome",
89
- placeholder="e.g., chr19",
90
- value="chr19",
91
- info="Chromosome name (e.g., chr1, chr19)"
92
- )
93
- start = gr.Number(
94
- label="Start Position",
95
- placeholder="e.g., 6700000",
96
- value=6_700_000,
97
- info="Start position in base pairs"
98
- )
99
- end = gr.Number(
100
- label="End Position",
101
- placeholder="e.g., 6831072",
102
- value=6_831_072,
103
- info="End position in base pairs"
104
- )
105
- species = gr.Dropdown(
106
- label="Species",
107
- choices=[
108
- "human", "mouse", "rat", "chicken", "zebrafish",
109
- "fruitfly", "worm", "yeast", "arabidopsis", "rice",
110
- "maize", "soybean", "tomato", "potato", "grape",
111
- "poplar", "medicago", "lotus", "brachypodium", "sorghum",
112
- "barley", "wheat", "oats", "rye"
113
- ],
114
- value="human",
115
- info="Select the species (24 supported species)"
116
- )
117
- predict_btn = gr.Button("🚀 Run Prediction", variant="primary")
118
-
119
- with gr.Column():
120
- output = gr.Textbox(
121
- label="Results",
122
- lines=15,
123
- interactive=False,
124
- placeholder="Results will appear here after running prediction..."
125
- )
126
-
127
- gr.Markdown("""
128
- ### 📝 Notes:
129
- - The model predicts ~7k functional tracks and 21 genomic elements
130
- - Predictions are made over 37.5% of the center region of the input sequence
131
- - Maximum region size: 1 Mb (1,000,000 base pairs)
132
- - First run may take longer as the model loads
133
- """)
134
-
135
- predict_btn.click(
136
- fn=predict_tracks,
137
- inputs=[chrom, start, end, species],
138
- outputs=output
139
- )
140
-
141
- gr.Examples(
142
- examples=[
143
- ["chr19", 6_700_000, 6_831_072, "human"],
144
- ["chr1", 100_000, 200_000, "human"],
145
- ["chr2", 50_000, 150_000, "mouse"],
146
- ],
147
- inputs=[chrom, start, end, species]
148
- )
149
-
150
- return demo_interface
151
-
152
- # Create the demo interface
153
- demo_interface = create_demo_interface()
154
-
155
- # If running this file directly (for local testing)
156
- if __name__ == "__main__":
157
- demo_interface.launch(server_name="0.0.0.0", share=False)
158
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
index.html CHANGED
@@ -273,7 +273,8 @@
273
  <!-- Tab Navigation -->
274
  <div class="tabs">
275
  <button class="tab-button active" data-tab="home">🏠 Home</button>
276
- <button class="tab-button" data-tab="demo">💻 Code Demo</button>
 
277
  </div>
278
 
279
  <!-- Home Tab (Content loaded from tabs/home.html) -->
@@ -281,10 +282,15 @@
281
  <!-- Content will be loaded dynamically -->
282
  </div>
283
 
284
- <!-- Code Demo Tab (Content loaded from tabs/demo.html) -->
285
- <div id="demo" class="tab-content">
286
  <!-- Content will be loaded dynamically -->
287
- </div>
 
 
 
 
 
288
 
289
  <!-- <div class="paper-summary">
290
  <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
@@ -301,7 +307,8 @@
301
  // Tab content mapping
302
  const tabFiles = {
303
  'home': 'tabs/home.html',
304
- 'demo': 'tabs/demo.html'
 
305
  };
306
 
307
  // Cache for loaded tab content
 
273
  <!-- Tab Navigation -->
274
  <div class="tabs">
275
  <button class="tab-button active" data-tab="home">🏠 Home</button>
276
+ <button class="tab-button" data-tab="functional_tracks">💻 Code Demo</button>
277
+ <button class="tab-button" data-tab="annotation">🧬 Genome Annotation</button>
278
  </div>
279
 
280
  <!-- Home Tab (Content loaded from tabs/home.html) -->
 
282
  <!-- Content will be loaded dynamically -->
283
  </div>
284
 
285
+ <!-- Functional Tracks Tab (Content loaded from tabs/functional_tracks.html) -->
286
+ <div id="functional_tracks" class="tab-content">
287
  <!-- Content will be loaded dynamically -->
288
+ </div>
289
+
290
+ <!-- Genome Annotation Tab (Content loaded from tabs/annotation.html) -->
291
+ <div id="annotation" class="tab-content">
292
+ <!-- Content will be loaded dynamically -->
293
+ </div>
294
 
295
  <!-- <div class="paper-summary">
296
  <h2>📄 A foundational model for joint sequence-function multi-species modeling at scale for long-range genomic prediction</h2>
 
307
  // Tab content mapping
308
  const tabFiles = {
309
  'home': 'tabs/home.html',
310
+ 'functional_tracks': 'tabs/functional_tracks.html',
311
+ 'annotation': 'tabs/annotation.html'
312
  };
313
 
314
  // Cache for loaded tab content
notebooks_pipelines/01_functional_track_prediction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks_pipelines/02_genome_annotation.ipynb CHANGED
@@ -11,7 +11,7 @@
11
  "\n",
12
  "The pipeline abstracts away all the underlying steps: running inference with the model, retrieving and processing the predicted probabilities, and applying the HMM to generate a consistent annotation. It returns a ready-to-use GFF file that can be visualized in any genome browser for the sequence of interest.\n",
13
  "\n",
14
- "If you’re interested in exploring the intermediate probabilities, please refer to the [track-prediction notebook](https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/01_tracks_prediction.ipynb). These probabilities can be useful for assessing model confidence and identifying potentially interesting biological regions. This notebook focuses on the higher-level task of producing gene annotations directly from raw DNA.\n",
15
  "\n",
16
  "> 📝 **Note for Google Colab users:** This notebook is compatible with Colab! For faster inference, make sure to enable GPU: Runtime → Change runtime type → GPU (T4 or better recommended)."
17
  ]
@@ -184,7 +184,7 @@
184
  "id": "190ff65e",
185
  "metadata": {},
186
  "source": [
187
- "## 4) 📁 Save a GFF file"
188
  ]
189
  },
190
  {
@@ -266,7 +266,7 @@
266
  },
267
  {
268
  "cell_type": "code",
269
- "execution_count": 8,
270
  "id": "0904a5cb",
271
  "metadata": {},
272
  "outputs": [
@@ -325,7 +325,7 @@
325
  ],
326
  "source": [
327
  "config = {\n",
328
- " \"genome\": \"hg38\", # built-in hg38\n",
329
  " \"locus\": f\"{chrom}:{start}-{end}\",\n",
330
  "}\n",
331
  "\n",
 
11
  "\n",
12
  "The pipeline abstracts away all the underlying steps: running inference with the model, retrieving and processing the predicted probabilities, and applying the HMM to generate a consistent annotation. It returns a ready-to-use GFF file that can be visualized in any genome browser for the sequence of interest.\n",
13
  "\n",
14
+ "If you’re interested in exploring the intermediate probabilities, please refer to the [track-prediction notebook](https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb). These probabilities can be useful for assessing model confidence and identifying potentially interesting biological regions. This notebook focuses on the higher-level task of producing gene annotations directly from raw DNA.\n",
15
  "\n",
16
  "> 📝 **Note for Google Colab users:** This notebook is compatible with Colab! For faster inference, make sure to enable GPU: Runtime → Change runtime type → GPU (T4 or better recommended)."
17
  ]
 
184
  "id": "190ff65e",
185
  "metadata": {},
186
  "source": [
187
+ "## 4) 📁 Save as GFF file"
188
  ]
189
  },
190
  {
 
266
  },
267
  {
268
  "cell_type": "code",
269
+ "execution_count": null,
270
  "id": "0904a5cb",
271
  "metadata": {},
272
  "outputs": [
 
325
  ],
326
  "source": [
327
  "config = {\n",
328
+ " \"genome\": assembly,\n",
329
  " \"locus\": f\"{chrom}:{start}-{end}\",\n",
330
  "}\n",
331
  "\n",
notebooks_pipelines/bigwig_outputs/HepG2_CTCF.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa369433dcd408740311d0a0c5209fbb8889402a82fdad222cc11413dcaf1f1a
3
+ size 380493
notebooks_pipelines/bigwig_outputs/HepG2_DNAse.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a59a1a90a9337b0862c27f4bc492e72f7d78ad0d3ffa5fdf7a981faca8d55cc
3
+ size 387370
notebooks_pipelines/bigwig_outputs/HepG2_H3k4me3.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:349d8edea3908f828dbb1946d9ab16f7220b61a5922d6a7b75ac4fa55b5f359a
3
+ size 381439
notebooks_pipelines/bigwig_outputs/HepG2_RNA_seq.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a116151b3b1550916f49093ca128326338911ee070481480e994a0baa1b00d4f
3
+ size 381391
notebooks_pipelines/bigwig_outputs/K562_CTCF.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf84025d20d7ec59efbf02e7f0d20d67e3bb9564ffe3538e2397c0a46a576aea
3
+ size 379394
notebooks_pipelines/bigwig_outputs/K562_DNAse.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8333fc13cfabe984cc303c575c2e65da20908240f8c733658b5b309a4191cb07
3
+ size 381686
notebooks_pipelines/bigwig_outputs/K562_H3k4me3.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:074971cc028eab364a6cb7642bcc0cf70603de6f3f3f425c600b3b6f90699f32
3
+ size 383184
notebooks_pipelines/bigwig_outputs/K562_RNA_seq.bw ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4118a761250629de70de55c935b38f53a0c0eb6dc2156dcb632335c1ef7f42
3
+ size 380637
requirements.txt DELETED
@@ -1,7 +0,0 @@
1
- gradio>=4.0.0
2
- torch>=2.0.0
3
- transformers>=4.55.0
4
- accelerate>=0.20.0
5
- safetensors>=0.3.0
6
- huggingface_hub>=0.23.0
7
-
 
 
 
 
 
 
 
 
tabs/annotation.html ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="summary">
2
+ <h2>🧬 NTv3 Post-Trained Genome Annotation</h2>
3
+ <p>This notebook demonstrates how to use the NTv3 post-trained model to perform genome annotation directly from a DNA sequence. It relies on a pipeline that applies a Hidden Markov Model (HMM) to the per-base probabilities returned by NTv3, converting them into a coherent gene model that respects biological constraints and valid transitions between genomic elements.</p>
4
+ <p>The pipeline abstracts away all the underlying steps: running inference with the model, retrieving and processing the predicted probabilities, and applying the HMM to generate a consistent annotation. It returns a ready-to-use GFF file that can be visualized in any genome browser for the sequence of interest.</p>
5
+ <p>If you're interested in exploring the intermediate probabilities, please refer to the <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks/01_tracks_prediction.ipynb" target="_blank" rel="noopener">track-prediction notebook</a>. These probabilities can be useful for assessing model confidence and identifying potentially interesting biological regions. This notebook focuses on the higher-level task of producing gene annotations directly from raw DNA.</p>
6
+ <p><strong>📝 Note for Google Colab users:</strong> This notebook is compatible with Colab! For faster inference, make sure to enable GPU: Runtime → Change runtime type → GPU (T4 or better recommended).</p>
7
+ </div>
8
+
9
+ <div class="grid">
10
+ <div class="card" style="grid-column: span 12;">
11
+ <h2>0) 📦 Imports + setup</h2>
12
+ <p>Install dependencies:</p>
13
+ <div class="code"><pre><code class="language-bash">pip -q install "transformers>=4.55" "huggingface_hub>=0.23" safetensors torch pyfaidx requests seaborn matplotlib igv_notebook</code></pre></div>
14
+
15
+ <p style="margin-top: 20px;">Import required libraries:</p>
16
+ <div class="code"><pre><code class="language-python">import re
17
+ import time
18
+ import torch
19
+ import requests
20
+ from transformers import pipeline</code></pre></div>
21
+ </div>
22
+
23
+ <div class="card" style="grid-column: span 12;">
24
+ <h2>1) 📦 Configuration</h2>
25
+ <p>Set your NTv3 model and genomic window here:</p>
26
+ <div class="code"><pre><code class="language-python"># Define the model and genomic window
27
+ model_name = "InstaDeepAI/NTv3_650M_pos"
28
+ assembly = "hg38"
29
+ chrom = "chr19"
30
+ start = 6_700_000
31
+ end = 6_831_072</code></pre></div>
32
+ </div>
33
+
34
+ <div class="card" style="grid-column: span 12;">
35
+ <h2>2) 📥 Fetch chromosome sequence for the chosen window</h2>
36
+ <div class="code"><pre><code class="language-python"># Get the sequence from the UCSC API
37
+ url = f"https://api.genome.ucsc.edu/getData/sequence?genome={assembly};chrom={chrom};start={start};end={end}"
38
+ seq = requests.get(url).json()["dna"].upper()
39
+ print(f"Original sequence length: {len(seq)}")
40
+
41
+ # Crop to multiple of 128 (the pipeline will crop again, but this is a no-op once divisible)
42
+ seq = seq[:int(len(seq) // 128) * 128]
43
+ print(f"Cropped sequence length: {len(seq)}, {len(seq) / 128} transformer tokens")</code></pre></div>
44
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
45
+ <strong>Example output:</strong><br>
46
+ Original sequence length: 131072<br>
47
+ Cropped sequence length: 131072, 1024.0 transformer tokens
48
+ </p>
49
+ </div>
50
+
51
+ <div class="card" style="grid-column: span 12;">
52
+ <h2>3) ⚡ Genome annotation pipeline (pre-processing, inference, post-processing)</h2>
53
+ <div class="code"><pre><code class="language-python"># Build NTv3 GFF pipeline
54
+ ntv3_gff = pipeline(
55
+ "ntv3-gff",
56
+ model=model_name,
57
+ trust_remote_code=True,
58
+ device=0 if torch.cuda.is_available() else -1,
59
+ )
60
+
61
+ # Run pipeline: DNA -> NTv3 -> HMM -> GFF3
62
+ inputs = {
63
+ "sequence": seq,
64
+ "chrom": chrom,
65
+ "start": start,
66
+ "end": end,
67
+ "assembly": assembly,
68
+ }
69
+
70
+ # Run the pipeline
71
+ start_time = time.time()
72
+ gff_text = ntv3_gff(inputs)
73
+ end_time = time.time()
74
+ print(f"Inference + decoding time: {end_time - start_time:.2f} seconds")</code></pre></div>
75
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
76
+ The pipeline performs all the necessary steps: running inference with the model, retrieving and processing the predicted probabilities, and applying the HMM to generate a consistent annotation.
77
+ </p>
78
+ </div>
79
+
80
+ <div class="card" style="grid-column: span 12;">
81
+ <h2>4) 📁 Save a GFF file</h2>
82
+ <div class="code"><pre><code class="language-python"># Save GFF3 file
83
+ short_model_name_match = re.search(r"[^/]+$", model_name)
84
+ short_model_name = short_model_name_match.group() if short_model_name_match else model_name
85
+
86
+ output_filename = f"{short_model_name}_{assembly}_{chrom}_{start}_{end}.gff3"
87
+ with open(output_filename, "w") as output_file:
88
+ output_file.write(gff_text)
89
+
90
+ print(f"Saved GFF file to {output_filename}")</code></pre></div>
91
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
92
+ <strong>Example output:</strong> Saved GFF file to NTv3_650M_pos_hg38_chr19_6700000_6831072.gff3
93
+ </p>
94
+ </div>
95
+
96
+ <div class="card" style="grid-column: span 12;">
97
+ <h2>5) 🌐 Create an IGV Browser</h2>
98
+ <div class="code"><pre><code class="language-python">import igv_notebook
99
+
100
+ igv_notebook.init()
101
+
102
+ config = {
103
+ "genome": "hg38", # built-in hg38
104
+ "locus": f"{chrom}:{start}-{end}",
105
+ }
106
+
107
+ gff_track = {
108
+ "name": "NTv3 annotations",
109
+ "format": "gff3",
110
+ "type": "annotation",
111
+ "url": output_filename, # just the filename
112
+ }
113
+
114
+ browser = igv_notebook.Browser(config)
115
+ browser.load_track(gff_track)
116
+
117
+ # Re-center on the region, just to be sure
118
+ browser.search(f"{chrom}:{start}-{end}")
119
+ browser # <- just return the object, no .show()</code></pre></div>
120
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
121
+ This creates an interactive IGV browser visualization of the annotations. The GFF file can also be visualized in any genome browser.
122
+ </p>
123
+ </div>
124
+
125
+ <div class="card" style="grid-column: span 12;">
126
+ <h2>📓 Full Notebook</h2>
127
+ <p>To view and run the complete notebook interactively:</p>
128
+ <ul>
129
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_genome_annotation.ipynb" target="_blank" rel="noopener">View notebook on Hugging Face</a></li>
130
+ <li>Download and run in Jupyter, Google Colab, or any notebook environment</li>
131
+ </ul>
132
+ </div>
133
+ </div>
134
+
tabs/demo.html DELETED
@@ -1,88 +0,0 @@
1
- <div class="summary">
2
- <h2>💻 Interactive Code Demo</h2>
3
- <p>Run the NTv3 650M post-trained model interactively to predict functional tracks and genomic elements for any genomic region.</p>
4
- <p><strong>Model:</strong> <code>InstaDeepAI/NTv3_650M_pos</code></p>
5
- </div>
6
-
7
- <div class="grid">
8
- <div class="card" style="grid-column: span 12;">
9
- <h2>🚀 NTv3 Track Prediction Pipeline</h2>
10
- <p>Enter a genomic region to get predictions for functional tracks and genomic elements. The model will predict ~7k functional tracks and 21 genomic elements over the center 37.5% of your input region.</p>
11
-
12
- <!-- Gradio app embedded here -->
13
- <!-- Note: With Gradio SDK, the app.py serves as the main interface -->
14
- <!-- The HTML interface can still be accessed, but the Gradio demo is the primary interface -->
15
- <div id="gradio-container" style="margin-top: 20px; min-height: 600px;">
16
- <p style="color: var(--muted); margin-bottom: 15px;">
17
- <strong>Note:</strong> With Gradio SDK enabled, the interactive demo is now the main interface of this Space.
18
- You can interact with it directly, or use the code example below to run predictions programmatically.
19
- </p>
20
- <div style="background: rgba(0,0,0,0.3); padding: 20px; border-radius: 12px; border: 1px solid var(--border);">
21
- <p style="color: var(--link); margin: 0;">
22
- 💡 The Gradio interactive demo is now available as the main interface of this Space.
23
- Refresh the page to see it, or use the code example below.
24
- </p>
25
- </div>
26
- </div>
27
-
28
- <p style="margin-top: 20px; color: var(--muted); font-size: 13px;">
29
- <strong>Note:</strong> The first run may take longer as the model loads. Maximum region size: 1 Mb (1,000,000 base pairs).
30
- </p>
31
- </div>
32
-
33
- <div class="card" style="grid-column: span 12;">
34
- <h2>📝 Code Example</h2>
35
- <p>Here's the Python code that powers the demo above. You can run this in a notebook or Python script:</p>
36
- <div class="code"><pre><code class="language-python">from transformers import pipeline
37
- import torch
38
-
39
- model_name = "InstaDeepAI/NTv3_650M_pos"
40
-
41
- ntv3_tracks = pipeline(
42
- "ntv3-tracks",
43
- model=model_name,
44
- trust_remote_code=True,
45
- device=0 if torch.cuda.is_available() else -1,
46
- )
47
-
48
- # Run track prediction
49
- out = ntv3_tracks(
50
- {
51
- "chrom": "chr19",
52
- "start": 6_700_000,
53
- "end": 6_831_072,
54
- "species": "human"
55
- }
56
- )
57
-
58
- # Print output shapes
59
- # 7k human tracks over 37.5 % center region of the input sequence
60
- print("bigwig_tracks_logits:", tuple(out.bigwig_tracks_logits.shape))
61
- # Location of 21 genomic elements over 37.5 % center region of the input sequence
62
- print("bed_tracks_logits:", tuple(out.bed_tracks_logits.shape))
63
- # Language model logits for whole sequence over vocabulary
64
- print("language model logits:", tuple(out.mlm_logits.shape))</code></pre></div>
65
- <p style="margin-top: 15px;">To run the interactive Gradio app locally:</p>
66
- <div class="code"><pre><code class="language-bash">pip install -r requirements.txt
67
- python app.py</code></pre></div>
68
- </div>
69
- </div>
70
-
71
- <script>
72
- // Try to detect if Gradio app is available
73
- window.addEventListener('load', function() {
74
- const iframe = document.getElementById('gradio-iframe');
75
- iframe.onerror = function() {
76
- // If iframe fails to load, keep showing the instructions
77
- document.getElementById('gradio-loading').style.display = 'block';
78
- iframe.style.display = 'none';
79
- };
80
- // Set a timeout to show instructions if iframe doesn't load
81
- setTimeout(function() {
82
- if (iframe.style.display === 'none') {
83
- document.getElementById('gradio-loading').style.display = 'block';
84
- }
85
- }, 2000);
86
- });
87
- </script>
88
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tabs/functional_tracks.html ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="summary">
2
+ <h2>🧬 NTv3 Post-Trained Functional Track Prediction</h2>
3
+ <p>This notebook demonstrates how to use the NTv3 post-trained model to predict functional tracks and genome annotation directly from a DNA sequence.</p>
4
+ <p>The pipeline abstracts away all the underlying steps: running inference with the model and plotting the predictions per tracks.</p>
5
+ <p>If you're interested in exploring the intermediate probabilities, please refer to the <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_tutorials/01_tracks_prediction.ipynb" target="_blank" rel="noopener">track-prediction notebook</a>.</p>
6
+ <p><strong>📝 Note for Google Colab users:</strong> This notebook is compatible with Colab! For faster inference, make sure to enable GPU: Runtime → Change runtime type → GPU (T4 or better recommended).</p>
7
+ </div>
8
+
9
+ <div class="grid">
10
+ <div class="card" style="grid-column: span 12;">
11
+ <h2>0) 📦 Imports + setup</h2>
12
+ <p>Install dependencies:</p>
13
+ <div class="code"><pre><code class="language-bash">pip -q install "transformers>=4.55" "huggingface_hub>=0.23" safetensors torch pyfaidx requests seaborn matplotlib igv_notebook pyBigWig</code></pre></div>
14
+
15
+ <p style="margin-top: 20px;">Import required libraries:</p>
16
+ <div class="code"><pre><code class="language-python">import re
17
+ import time
18
+ import os
19
+ import torch
20
+ import requests
21
+ import numpy as np
22
+ import pyBigWig
23
+ from transformers import pipeline, AutoConfig</code></pre></div>
24
+ </div>
25
+
26
+ <div class="card" style="grid-column: span 12;">
27
+ <h2>1) 📦 Configuration</h2>
28
+ <p>Set your NTv3 model and genomic window here:</p>
29
+ <div class="code"><pre><code class="language-python"># Define the model and genomic window
30
+ model_name = "InstaDeepAI/NTv3_650M_pos"
31
+
32
+ species = "human" # will use for condition the model on species
33
+ assembly = "hg38" # will use for fetching the chromosome sequence
34
+
35
+ chrom = "chr19"
36
+ start = 6_700_000
37
+ end = 6_831_072</code></pre></div>
38
+ </div>
39
+
40
+ <div class="card" style="grid-column: span 12;">
41
+ <h2>2) 📥 Fetch chromosome sequence for the chosen window</h2>
42
+ <div class="code"><pre><code class="language-python"># Get the sequence from the UCSC API
43
+ url = f"https://api.genome.ucsc.edu/getData/sequence?genome={assembly};chrom={chrom};start={start};end={end}"
44
+ seq = requests.get(url).json()["dna"].upper()
45
+ print(f"Original sequence length: {len(seq)}")
46
+
47
+ # Crop to multiple of 128 (the pipeline will crop again, but this is a no-op once divisible)
48
+ seq = seq[:int(len(seq) // 128) * 128]
49
+ print(f"Cropped sequence length: {len(seq)}, {len(seq) / 128} transformer tokens")</code></pre></div>
50
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
51
+ <strong>Example output:</strong><br>
52
+ Original sequence length: 131072<br>
53
+ Cropped sequence length: 131072, 1024.0 transformer tokens
54
+ </p>
55
+ </div>
56
+
57
+ <div class="card" style="grid-column: span 12;">
58
+ <h2>3) ⚡ Functional track prediction pipeline (pre-processing, inference, plotting)</h2>
59
+ <div class="code"><pre><code class="language-python"># Build NTv3 tracks pipeline
60
+ ntv3_tracks = pipeline(
61
+ "ntv3-tracks",
62
+ model=model_name,
63
+ trust_remote_code=True,
64
+ device=0 if torch.cuda.is_available() else -1,
65
+ )
66
+
67
+ # Select tracks to plot
68
+ tracks_to_plot = {
69
+ "K562 RNA-seq": "ENCSR056HPM",
70
+ "K562 DNAse": "ENCSR921NMD",
71
+ "K562 H3k4me3": "ENCSR000DWD",
72
+ "K562 CTCF": "ENCSR000AKO",
73
+ "HepG2 RNA-seq": "ENCSR561FEE_P",
74
+ "HepG2 DNAse": "ENCSR000EJV",
75
+ "HepG2 H3k4me3": "ENCSR000AMP",
76
+ "HepG2 CTCF": "ENCSR000BIE",
77
+ }
78
+ elements_to_plot = ["protein_coding_gene", "exon", "intron", "splice_donor", "splice_acceptor"]
79
+
80
+ # Run pipeline: DNA -> NTv3 -> Tracks -> plot
81
+ start_time = time.time()
82
+
83
+ ntv3_predictions = ntv3_tracks(
84
+ {"chrom": "chr19", "start": 6_700_000, "end": 6_831_072, "species": species},
85
+ plot=True,
86
+ tracks_to_plot=tracks_to_plot,
87
+ elements_to_plot=elements_to_plot,
88
+ )
89
+
90
+ end_time = time.time()
91
+
92
+ print(f"Inference + decoding time: {end_time - start_time:.2f} seconds")</code></pre></div>
93
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
94
+ The pipeline performs all the necessary steps: running inference with the model and plotting the predictions for the specified tracks and genomic elements.
95
+ </p>
96
+ </div>
97
+
98
+ <div class="card" style="grid-column: span 12;">
99
+ <h2>4) 📁 Save as BigWig file</h2>
100
+ <div class="code"><pre><code class="language-python"># Load config to get track names and find indices for tracks_to_plot
101
+ cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
102
+ all_bigwig_names = cfg.bigwigs_per_file_assembly[assembly]
103
+
104
+ # Find indices of tracks we want to save
105
+ # Use display names (keys) for filenames, but track IDs (values) to find indices
106
+ track_data_list = [] # List of (display_name, track_id, index) tuples
107
+ for display_name, track_id in tracks_to_plot.items():
108
+ try:
109
+ idx = all_bigwig_names.index(track_id)
110
+ track_data_list.append((display_name, track_id, idx))
111
+ except ValueError:
112
+ print(f"Warning: Track '{track_id}' ({display_name}) not found in config. Skipping...")
113
+
114
+ print(f"Found {len(track_data_list)} tracks to save from tracks_to_plot")
115
+
116
+ # Get predictions (shape: (49152, 7362))
117
+ bigwig_logits = ntv3_predictions.bigwig_tracks_logits
118
+ if isinstance(bigwig_logits, torch.Tensor):
119
+ bigwig_logits = bigwig_logits.detach().cpu().numpy()
120
+
121
+ # Calculate genomic coordinates for the center 37.5% region
122
+ # The predictions cover the center 37.5% of the input sequence
123
+ input_length = end - start
124
+ center_start_offset = int(input_length * 0.3125) # (1 - 0.375) / 2 = 0.3125
125
+ center_length = int(input_length * 0.375)
126
+ center_start = start + center_start_offset
127
+ center_end = center_start + center_length
128
+
129
+ print(f"Input region: {chrom}:{start}-{end} (length: {input_length:,} bp)")
130
+ print(f"Prediction region: {chrom}:{center_start}-{center_end} (length: {center_length:,} bp)")
131
+ print(f"Number of positions: {bigwig_logits.shape[0]}")
132
+
133
+ # Create output directory
134
+ output_dir = "bigwig_outputs"
135
+ os.makedirs(output_dir, exist_ok=True)
136
+
137
+ # Save each track as a separate BigWig file
138
+ print(f"\nSaving BigWig files to '{output_dir}/' directory...")
139
+ for i, (display_name, track_id, track_idx) in enumerate(track_data_list):
140
+ # Get track data (logits for this track)
141
+ track_data = bigwig_logits[:, track_idx].astype(np.float32)
142
+
143
+ # Create BigWig file using display name (key) for filename
144
+ # Clean the display name for use as filename (replace spaces, special chars)
145
+ track_clean_name = display_name.replace(" ", "_").replace("/", "_").replace("-", "_")
146
+ bw_filename = os.path.join(output_dir, f"{track_clean_name}.bw")
147
+ bw = pyBigWig.open(bw_filename, "w")
148
+
149
+ # Add header (chromosome name and size). Note: the region end is used as a
150
+ # stand-in chromosome length, sufficient for this single-region file.
+ bw.addHeader([(chrom, end)])
151
+
152
+ # Add entries (intervals with values)
153
+ # Each position in track_data corresponds to one base pair
154
+ starts = np.arange(center_start, center_start + len(track_data), dtype=np.int64)
155
+ ends = starts + 1
156
+ values = track_data.tolist()
157
+
158
+ bw.addEntries(
159
+ chroms=[chrom] * len(starts),
160
+ starts=starts.tolist(),
161
+ ends=ends.tolist(),
162
+ values=values
163
+ )
164
+
165
+ bw.close()
166
+
167
+ print(f" Saved {i + 1}/{len(track_data_list)}: {display_name} ({track_clean_name}.bw)")
168
+
169
+ print(f"\n✅ Successfully saved {len(track_data_list)} BigWig files to '{output_dir}/'")</code></pre></div>
170
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
171
+ This saves each selected functional track as a separate BigWig file that can be visualized in genome browsers. The files are saved with user-friendly display names (e.g., "K562_RNA_seq.bw").
172
+ </p>
173
+ </div>
174
+
175
+ <div class="card" style="grid-column: span 12;">
176
+ <h2>5) 🌐 Create an IGV Browser</h2>
177
+ <div class="code"><pre><code class="language-python">import igv_notebook
178
+
179
+ igv_notebook.init()
180
+
181
+ # Build tracks array with all BigWig files we saved
182
+ tracks = []
183
+ for track_display_name, track_id in tracks_to_plot.items():
184
+ # Clean the display name to match the filename we saved
185
+ track_clean_name = track_display_name.replace(" ", "_").replace("/", "_").replace("-", "_")
186
+ bigwig_path = os.path.join(output_dir, f"{track_clean_name}.bw")
187
+ bigwig_track = {
188
+ "name": track_display_name,
189
+ "format": "bigwig",
190
+ "url": bigwig_path,
191
+ "height": 70,
192
+ "autoscale": True,
193
+ "displayMode": "EXPANDED",
194
+ }
195
+ tracks.append(bigwig_track)
196
+
197
+ config = {
198
+ "genome": assembly,
199
+ "locus": f"{chrom}:{center_start}-{center_end}",
200
+ "tracks": tracks,
201
+ "theme": "dark",
202
+ }
203
+
204
+ browser = igv_notebook.Browser(config)
205
+ browser  # returning the object renders the browser inline; igv_notebook has no .show() method
206
+ <p style="margin-top: 15px; color: var(--muted); font-size: 13px;">
207
+ This creates an interactive IGV browser visualization with a dark theme showing all the predicted functional tracks. The BigWig files can also be visualized in any genome browser.
208
+ </p>
209
+ </div>
210
+
211
+ <div class="card" style="grid-column: span 12;">
212
+ <h2>📓 Full Notebook</h2>
213
+ <p>To view and run the complete notebook interactively:</p>
214
+ <ul>
215
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/01_functional_track_prediction.ipynb" target="_blank" rel="noopener">View notebook on Hugging Face</a></li>
216
+ <li>Download and run in Jupyter, Google Colab, or any notebook environment</li>
217
+ </ul>
218
+ </div>
219
+ </div>
220
+
tabs/home.html CHANGED
@@ -92,7 +92,7 @@
92
  <div class="card">
93
  <h2>📓 Pipelines notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener">folder</a>)</h2>
94
  <ul>
95
- <li> 🎯 01 — Generate bigwig predictions for certain tracks</li>
96
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_genome_annotation.ipynb" target="_blank" rel="noopener">🏷️ 02 — Genome annotation / segmentation</a></li>
97
  <li>🎯 03 — Fine-tune on bigwig tracks</li>
98
  <li>🔍 04 — Interpret a given genomic region</li>
 
92
  <div class="card">
93
  <h2>📓 Pipelines notebooks (browse <a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/tree/main/notebooks_pipelines" target="_blank" rel="noopener">folder</a>)</h2>
94
  <ul>
95
+ <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/01_functional_track_prediction.ipynb" target="_blank" rel="noopener">🎯 01 — Generate bigwig predictions for certain tracks</a></li>
96
  <li><a href="https://huggingface.co/spaces/InstaDeepAI/ntv3/blob/main/notebooks_pipelines/02_genome_annotation.ipynb" target="_blank" rel="noopener">🏷️ 02 — Genome annotation / segmentation</a></li>
97
  <li>🎯 03 — Fine-tune on bigwig tracks</li>
98
  <li>🔍 04 — Interpret a given genomic region</li>