Querying
Using SQL
To query the parsomics
database using SQL, feel free to use your SQL client of
choice. Make sure you follow check out the information at Database
credentials.
Again, the parsomics
team recommends using DBeaver. Not only only does it
improve the readability of query results using tables, but also allows its
users to save their SQL queries for later usage. Check out the section on
Setting up DBeaver for a full
tutorial on using DBeaver with your local parsomics
database.
Using Python
You can query the parsomics
database with Python, using SQLModel, the
Object–Relational Mapping (ORM) library that powers parsomics-core
.
SQLModel cleverly uses Python's typing system to create "pythonic" abstractions
for interacting with relational databases.
One of the best features of SQLModel when it comes to querying is that it can infer which attributes are used for joining tables, when not ambiguous. You can read more about it in SQLModel's documentation.
Using the REST API
Finally, you can also query the parsomics
database using the querying
endpoint of the API. However, beware that this endpoint only supports queries
with SELECT
, JOIN
, and WHERE
statements .
Make sure you follow the instructions for starting the REST API server before you proceed.
Certainly. Here's a concise and clear API spec you can include in your project documentation:
Examples
Get the name and the sequence of a certain contig
In this example, 42
is the primary key of the contig.
- SQL
- Python
- REST API
SELECT FASTAEntry.sequence_name, FASTAEntry.sequence
FROM FASTAEntry
JOIN Contig ON Contig.fasta_entry_key = FASTAEntry.key
WHERE Contig.key = 42;
from sqlmodel import Session, select
from parsomics_core.entities import FASTAEntry, Contig
from parsomics_core.globals.database import engine
with Session(engine) as session:
statement = (
select(FASTAEntry.sequence_name, FASTAEntry.sequence)
.join(Contig, Contig.fasta_entry_key == FASTAEntry.key)
.where(Contig.key == 42)
)
result = session.exec(statement).all()
print(result)
selects: [
{mdl: "FASTAEntry", attr: "sequence_name"},
{mdl: "FASTAEntry", attr: "sequence"}
],
joins: [
{
mdl: "Contig",
onclause: {
mdl_l: "Contig",
attr_l: "fasta_entry_key",
mdl_r: "FASTAEntry",
attr_r: "key"
},
isouter: false,
full: false
}
],
wheres: [{mdl: "Contig", attr: "key", op: "==", val: 42}]
}
Get all protein annotations produced by CLEAN
In this example, clean
is the name of the tool.
- SQL
- Python
- REST API
SELECT proteinannotationentry.*
FROM proteinannotationentry
JOIN proteinannotationfile ON proteinannotationentry.file_key = proteinannotationfile.key
JOIN run ON proteinannotationfile.run_key = run.key
JOIN tool ON run.tool_key = tool.key
WHERE tool.name = 'clean'
from sqlmodel import Session, select
from parsomics_core.entities import (
ProteinAnnotationEntry,
ProteinAnnotationFile,
Run,
Tool,
)
from parsomics_core.globals.database import engine
with Session(engine) as session:
statement = (
select(ProteinAnnotationEntry)
.join(ProteinAnnotationFile)
.join(Run)
.join(Tool)
.where(Tool.name == "clean")
)
result = session.exec(statement).all()
return result
print(result)
{
"selects": [
{"mdl": "ProteinAnnotationEntry", "attr": "key"},
{"mdl": "ProteinAnnotationEntry", "attr": "description"},
{"mdl": "ProteinAnnotationEntry", "attr": "coord_start"},
{"mdl": "ProteinAnnotationEntry", "attr": "coord_stop"},
{"mdl": "ProteinAnnotationEntry", "attr": "accession"},
{"mdl": "ProteinAnnotationEntry", "attr": "score"},
{"mdl": "ProteinAnnotationEntry", "attr": "annotation_type"},
{"mdl": "ProteinAnnotationEntry", "attr": "details"},
{"mdl": "ProteinAnnotationEntry", "attr": "protein_key"},
{"mdl": "ProteinAnnotationEntry", "attr": "file_key"},
{"mdl": "ProteinAnnotationEntry", "attr": "source_key"}
],
"joins": [
{
"mdl": "ProteinAnnotationFile",
"onclause": {
"mdl_l": "ProteinAnnotationEntry",
"attr_l": "file_key",
"mdl_r": "ProteinAnnotationFile",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "Run",
"onclause": {
"mdl_l": "ProteinAnnotationFile",
"attr_l": "run_key",
"mdl_r": "Run",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "Tool",
"onclause": {
"mdl_l": "Run",
"attr_l": "tool_key",
"mdl_r": "Tool",
"attr_r": "key"
},
"isouter": false,
"full": false
}
],
"wheres": [
{"mdl": "Tool", "attr": "name", "op": "==", "val": "clean"}
]
}
Get the name of the sample on which a gene was found
In this example, 7
is the primary key of the gene.
- SQL
- Python
- REST API
SELECT sample.name
FROM sample
JOIN genome ON genome.sample_key = sample.key
JOIN contig ON contig.genome_key = genome.key
JOIN gene ON gene.contig_key = contig.key
WHERE gene.key = 7
from sqlmodel import Session, select
from parsomics_core.entities import (
Sample,
Genome,
Contig,
Gene,
)
from parsomics_core.globals.database import engine
with Session(engine) as session:
statement = (
select(Sample.name)
.join(Genome)
.join(Contig)
.join(Gene)
.where(
Gene.key == 7,
)
)
result = session.exec(statement).all()
print(result)
{
"selects": [
{ "mdl": "Sample", "attr": "name" }
],
"joins": [
{
"mdl": "Genome",
"onclause": {
"mdl_l": "Genome",
"attr_l": "sample_key",
"mdl_r": "Sample",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "Contig",
"onclause": {
"mdl_l": "Contig",
"attr_l": "genome_key",
"mdl_r": "Genome",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "Gene",
"onclause": {
"mdl_l": "Gene",
"attr_l": "contig_key",
"mdl_r": "Contig",
"attr_r": "key"
},
"isouter": false,
"full": false
}
],
"wheres": [
{
"mdl": "Gene",
"attr": "key",
"op": "==",
"val": 7
}
]
}
Get the project that a contig belongs to
In this example, 66
is the primary key of the contig.
- SQL
- Python
- REST API
SELECT project.*
FROM project
JOIN assembly ON assembly.project_key = project.key
JOIN run ON run.assembly_key = assembly.key
JOIN fastafile ON fastafile.run_key = run.key
JOIN fastaentry ON fastaentry.file_key = fastafile.key
JOIN contig ON contig.fasta_entry_key = fastaentry.key
WHERE contig.key = 66
from sqlmodel import Session, select
from parsomics_core.entities import (
Project,
Assembly,
Run,
FASTAFile,
FASTAEntry,
Contig,
)
from parsomics_core.globals.database import engine
with Session(engine) as session:
statement = (
select(Project)
.join(Assembly)
.join(Run)
.join(FASTAFile)
.join(FASTAEntry)
.join(Contig)
.where(Contig.key == 66)
)
result = session.exec(statement).all()
print(result)
{
"selects": [
{"mdl": "Project", "attr": "key"},
{"mdl": "Project", "attr": "created_at"},
{"mdl": "Project", "attr": "updated_at"},
{"mdl": "Project", "attr": "status"},
{"mdl": "Project", "attr": "name"}
],
"joins": [
{
"mdl": "Assembly",
"onclause": {
"mdl_l": "Assembly",
"attr_l": "project_key",
"mdl_r": "Project",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "Run",
"onclause": {
"mdl_l": "Run",
"attr_l": "assembly_key",
"mdl_r": "Assembly",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "FASTAFile",
"onclause": {
"mdl_l": "FASTAFile",
"attr_l": "run_key",
"mdl_r": "Run",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "FASTAEntry",
"onclause": {
"mdl_l": "FASTAEntry",
"attr_l": "file_key",
"mdl_r": "FASTAFile",
"attr_r": "key"
},
"isouter": false,
"full": false
},
{
"mdl": "Contig",
"onclause": {
"mdl_l": "Contig",
"attr_l": "fasta_entry_key",
"mdl_r": "FASTAEntry",
"attr_r": "key"
},
"isouter": false,
"full": false
}
],
"wheres": [
{
"mdl": "Contig",
"attr": "key",
"op": "==",
"val": 66
}
]
}
Get detailed information on all protein annotations of a contig
In this example, 13
is the primary key of the contig.
- SQL
- Python
- REST API
SELECT
proteinannotationentry.key,
proteinannotationentry.description,
proteinannotationentry.coord_start as annotation_coord_start,
proteinannotationentry.coord_stop as annotation_coord_stop,
gffentry.coord_start as cds_coord_start,
gffentry.coord_stop as cds_coord_stop,
tool.key as tool_key,
tool.name as tool_name,
source.key as source_key,
source.name as source_name
FROM proteinannotationentry
JOIN protein ON proteinannotationentry.protein_key = protein.key
JOIN fragmentproteinlink ON protein.key = fragmentproteinlink.protein_key
JOIN fragment ON fragmentproteinlink.fragment_key = fragment.key
JOIN gene ON fragment.gene_key = gene.key
JOIN gffentry ON fragment.gff_entry_key = gffentry.key
LEFT JOIN source ON proteinannotationentry.source_key = source.key
JOIN proteinannotationfile ON proteinannotationentry.file_key = proteinannotationfile.key
JOIN run ON proteinannotationfile.run_key = run.key
JOIN tool ON run.tool_key = tool.key
WHERE
gene.contig_key = 13 AND gffentry.fragment_type = 'CDS'
from sqlmodel import Session, select
from parsomics_core.entities import (
ProteinAnnotationEntry,
Protein,
FragmentProteinLink,
Fragment,
Gene,
GFFEntry,
Source,
ProteinAnnotationFile,
Run,
Tool
)
from parsomics_core.globals.database import engine
with Session(engine) as session:
statement = (
select(
ProteinAnnotationEntry.key,
ProteinAnnotationEntry.description,
ProteinAnnotationEntry.coord_start.label("annotation_coord_start"),
ProteinAnnotationEntry.coord_stop.label("annotation_coord_stop"),
GFFEntry.coord_start.label("cds_coord_start"),
GFFEntry.coord_stop.label("cds_coord_stop"),
Tool.key.label("tool_key"),
Tool.name.label("tool_name"),
Source.key.label("source_key"),
Source.name.label("source_name")
)
.join(Protein, ProteinAnnotationEntry.protein_key == Protein.key)
.join(FragmentProteinLink, Protein.key == FragmentProteinLink.protein_key)
.join(Fragment, FragmentProteinLink.fragment_key == Fragment.key)
.join(Gene, Fragment.gene_key == Gene.key)
.join(GFFEntry, Fragment.gff_entry_key == GFFEntry.key)
.outerjoin(Source, ProteinAnnotationEntry.source_key == Source.key)
.join(ProteinAnnotationFile, ProteinAnnotationEntry.file_key == ProteinAnnotationFile.key)
.join(Run, ProteinAnnotationFile.run_key == Run.key)
.join(Tool, Run.tool_key == Tool.key)
.where(
Gene.contig_key == 13,
GFFEntry.fragment_type == "CDS"
)
)
result = session.exec(statement).all()
print(result)
{
selects: [
{mdl: "ProteinAnnotationEntry", attr: "key"},
{mdl: "ProteinAnnotationEntry", attr: "description"},
{mdl: "ProteinAnnotationEntry", attr: "coord_start"},
{mdl: "ProteinAnnotationEntry", attr: "coord_stop"},
{mdl: "GFFEntry", attr: "coord_start"},
{mdl: "GFFEntry", attr: "coord_stop"},
{mdl: "Tool", attr: "key"},
{mdl: "Tool", attr: "name"},
{mdl: "Source", attr: "key"},
{mdl: "Source", attr: "name"},
],
joins: [
{
mdl: "Protein",
onclause: {
mdl_l: "ProteinAnnotationEntry",
attr_l: "protein_key",
mdl_r: "Protein",
attr_r: "key"
},
isouter: false,
full: false
},
{
mdl: "FragmentProteinLink",
onclause: {
mdl_l: "Protein",
attr_l: "key",
mdl_r: "FragmentProteinLink",
attr_r: "protein_key"
},
isouter: false,
full: false
},
{
mdl: "Fragment",
onclause: {
mdl_l: "FragmentProteinLink",
attr_l: "fragment_key",
mdl_r: "Fragment",
attr_r: "key"
},
isouter: false,
full: false
},
{
mdl: "Gene",
onclause: {
mdl_l: "Fragment",
attr_l: "gene_key",
mdl_r: "Gene",
attr_r: "key"
},
isouter: false,
full: false
},
{
mdl: "GFFEntry",
onclause: {
mdl_l: "Fragment",
attr_l: "gff_entry_key",
mdl_r: "GFFEntry",
attr_r: "key"
},
isouter: false,
full: false
},
{
mdl: "Source",
onclause: {
mdl_l: "ProteinAnnotationEntry",
attr_l: "source_key",
mdl_r: "Source",
attr_r: "key"
},
isouter: true,
full: false
},
{
mdl: "ProteinAnnotationFile",
onclause: {
mdl_l: "ProteinAnnotationEntry",
attr_l: "file_key",
mdl_r: "ProteinAnnotationFile",
attr_r: "key"
},
isouter: false,
full: false
},
{
mdl: "Run",
onclause: {
mdl_l: "ProteinAnnotationFile",
attr_l: "run_key",
mdl_r: "Run",
attr_r: "key"
},
isouter: false,
full: false
},
{
mdl: "Tool",
onclause: {
mdl_l: "Run",
attr_l: "tool_key",
mdl_r: "Tool",
attr_r: "key"
},
isouter: false,
full: false
},
],
wheres: [
{mdl: "Gene", attr: "contig_key", op: "==", val: contigKey},
{mdl: "GFFEntry", attr: "fragment_type", op: "==", val: "CDS"},
]
}