Example Queries
In [1]:
Copied!
import pyarrow.dataset as ds
from graphique import GraphQL
def execute(query):
result = app.schema.execute_sync(query, root_value=app.root_value, context_value={})
for error in result.errors or []:
raise ValueError(error)
return result.data
format = ds.ParquetFileFormat(read_options={'dictionary_columns': ['state']})
dataset = ds.dataset('../tests/fixtures/zipcodes.parquet', format=format)
app = GraphQL(dataset)
import pyarrow.dataset as ds
from graphique import GraphQL
def execute(query):
result = app.schema.execute_sync(query, root_value=app.root_value, context_value={})
for error in result.errors or []:
raise ValueError(error)
return result.data
format = ds.ParquetFileFormat(read_options={'dictionary_columns': ['state']})
dataset = ds.dataset('../tests/fixtures/zipcodes.parquet', format=format)
app = GraphQL(dataset)
Introspect the dataset.¶
In [2]:
Copied!
execute("""{
length
schema {
names
types
partitioning
index
}
}""")
execute("""{
length
schema {
names
types
partitioning
index
}
}""")
Out[2]:
{'length': 41700, 'schema': {'names': ['latitude', 'longitude', 'state', 'city', 'county', 'zipcode'], 'types': ['double', 'double', 'dictionary<values=string, indices=int32, ordered=0>', 'string', 'string', 'int32'], 'partitioning': [], 'index': ['zipcode']}}
Loading options¶
- Scanner with camel-cased fields (not relevant in this dataset)
- Table already read
In [3]:
Copied!
import pyarrow.compute as pc
from strawberry.utils.str_converters import to_camel_case
columns = {to_camel_case(name): pc.field(name) for name in dataset.schema.names}
GraphQL(dataset.scanner(columns=columns))
import pyarrow.compute as pc
from strawberry.utils.str_converters import to_camel_case
columns = {to_camel_case(name): pc.field(name) for name in dataset.schema.names}
GraphQL(dataset.scanner(columns=columns))
Out[3]:
<graphique.middleware.GraphQL at 0x7f76ebabd430>
In [4]:
Copied!
GraphQL(dataset.to_table())
GraphQL(dataset.to_table())
Out[4]:
<graphique.middleware.GraphQL at 0x7f77281c3dd0>
Find California counties with the most cities.¶
filter
state by "CA"group
by county- aggregate distinct count of cities
sort
by city counts descending- access
columns
county
is still known in the schema- cities is a new
column
accessed through an inline fragment
In [5]:
Copied!
execute("""{
filter(state: {eq: "CA"}) {
group(by: "county", aggregate: {countDistinct: {name: "city", alias: "cities"}}) {
sort(by: "-cities", length: 5) {
columns {
county {
values
}
}
cities: column(name: "cities") {
... on LongColumn {
values
}
}
}
}
}
}""")
execute("""{
filter(state: {eq: "CA"}) {
group(by: "county", aggregate: {countDistinct: {name: "city", alias: "cities"}}) {
sort(by: "-cities", length: 5) {
columns {
county {
values
}
}
cities: column(name: "cities") {
... on LongColumn {
values
}
}
}
}
}
}""")
Out[5]:
{'filter': {'group': {'sort': {'columns': {'county': {'values': ['Los Angeles', 'San Bernardino', 'San Diego', 'Orange', 'Riverside']}}, 'cities': {'values': [132, 73, 52, 47, 42]}}}}}
Find states with cities which match the name of their county.¶
scan
instead offilter
, because comparing two columns is not a "simple" queryColumn.unique
instead ofgroup
, because no other aggregates are needed
In [6]:
Copied!
execute("""{
scan(filter: {eq: [{name: "county"}, {name: "city"}]}) {
columns {
state {
unique {
length
values
}
}
}
}
}""")
execute("""{
scan(filter: {eq: [{name: "county"}, {name: "city"}]}) {
columns {
state {
unique {
length
values
}
}
}
}
}""")
Out[6]:
{'scan': {'columns': {'state': {'unique': {'length': 50, 'values': ['PR', 'MA', 'RI', 'NH', 'ME', 'VT', 'CT', 'NJ', 'NY', 'PA', 'DE', 'MD', 'VA', 'WV', 'NC', 'SC', 'FL', 'AL', 'TN', 'MS', 'KY', 'OH', 'IN', 'MI', 'IA', 'WI', 'MN', 'SD', 'ND', 'MT', 'IL', 'MO', 'KS', 'NE', 'LA', 'AR', 'OK', 'TX', 'CO', 'WY', 'ID', 'UT', 'AZ', 'NM', 'NV', 'CA', 'HI', 'OR', 'WA', 'AK']}}}}}
States which have no cities which match the name of their county.¶
The opposite of the previous example. Filtering rows would drop needed data; the "zeros" have to be counted.
scan
with projected column matching names instead of filteringgroup
by state- aggregate whether there are
any
matches
- aggregate whether there are
scan
for no matches- access column
In [7]:
Copied!
execute("""{
scan(columns: {alias: "match", eq: [{name: "county"}, {name: "city"}]}) {
group(by: "state", aggregate: {any: {name: "match"}}) {
scan(filter: {inv: {name: "match"}}) {
columns {
state {
values
}
}
}
}
}
}""")
execute("""{
scan(columns: {alias: "match", eq: [{name: "county"}, {name: "city"}]}) {
group(by: "state", aggregate: {any: {name: "match"}}) {
scan(filter: {inv: {name: "match"}}) {
columns {
state {
values
}
}
}
}
}
}""")
Out[7]:
{'scan': {'group': {'scan': {'columns': {'state': {'values': ['DC', 'GA']}}}}}}