Skip to content

Example Queries

annekroon edited this page Apr 2, 2019 · 6 revisions

Get all articles from multiple specified outlets in a given time range

  • listofdoctypes is a list of doctypes
  • fromdate and todate are strings of the form '2011-01-31'
# Elasticsearch query body: every document whose doctype matches one of
# `listofdoctypes` and whose publication_date lies in [fromdate, todate).
# Both clauses are placed in "filter" context, i.e. they only restrict the
# result set (see the Elasticsearch bool-query documentation).
# `listofdoctypes`, `fromdate` and `todate` must be defined beforehand.
query = {
    "query": {
        "bool": {
            "filter": [
                # One "match" clause per requested doctype, combined with "should".
                {"bool": {"should": [{"match": {"doctype": d}} for d in listofdoctypes]}},
                # "gte" includes fromdate; "lt" excludes todate.
                {"range": {"publication_date": {"gte": fromdate, "lt": todate}}},
            ]
        }
    }
}

The same, but only if one of several phrases occurs in the headline:

from inca import Inca
myinca = Inca()

# Half-open date range: fromdate is included, todate is excluded.
fromdate = '2015-01-01'
todate = '2018-01-02'
listofdoctypes = ['nu', 'nrc (www)', 'volkskrant (www)']

# Documents from the given outlets and date range whose headline
# ("title" or "title_rss") contains at least one of the quoted phrases.
q = {
    "query": {
        "bool": {
            "must": [
                {
                    "query_string": {
                        "fields": ["title", "title_rss"],
                        "query": '"Europese Commissie" OR "Europese Centrale Bank"',
                        # NOTE(review): passed as the string 'true', not a Python
                        # bool; confirm the Elasticsearch version in use still
                        # accepts this option.
                        "use_dis_max": 'true',
                    }
                }
            ],
            "filter": [
                # NOTE(review): another example on this page matches on
                # "doctype.keyword" instead of "doctype" -- confirm which field
                # gives exact matching for names such as 'nrc (www)'.
                {"bool": {"should": [{"match": {"doctype": d}} for d in listofdoctypes]}},
                {"range": {"publication_date": {"gte": fromdate, "lt": todate}}},
            ],
        }
    }
}

g = myinca.database.document_generator(q)

# Fetch only the first result; raises StopIteration if nothing matched.
first = next(g)

Similar, but more sophisticated:

from inca import Inca
myinca = Inca()

# Half-open date range: fromdate is included, todate is excluded.
fromdate = '2017-09-13'
todate = '2018-01-15'
listofdoctypes = ['nu', 'nrc (www)', 'volkskrant (www)', 'telegraaf (www)', 'ad (www)', 'trouw (www)', 'nos (www)', 'geenstijl (www)', 'parool (www)', 'metro (www)']

# Fields handed to the CSV exporter at the end of the script.
exportfields = ['title', 'title_rss', 'doctype', 'publication_date', 'byline', 'teaser_rss', 'url', 'text']

# Simple variant: any of the phrases occurs in the searched fields.
querystring = '"Europese Raad" OR "Europese Commissie" OR "Europese Centrale Bank" OR "ECB" OR "Europese Parlement" OR "Europaparlement" OR "EU" OR "Europese Unie" OR "Europese Commissie" OR "Eurozone" OR "Eurogroep"'

# Better variant: additionally accept headlines mentioning "Brussel" when the
# article text contains one of the EU phrases. Boolean operators are written
# in upper case (AND / OR) because query_string syntax only recognises them
# in that form.
querystring2 = '("Europese Raad" OR "Europese Commissie" OR "Europese Centrale Bank" OR "ECB" OR "Europese Parlement" OR "Europaparlement" OR "EU" OR "Europese Unie" OR "Europese Commissie" OR "Eurozone" OR "Eurogroep") OR (Brussel  AND (text:"EU" OR text:"Europese Unie" OR text:"Europese Commissie" OR text:"Europees Parlement" OR text:"Europese Parlement" OR text:"Europaparlement" OR text:"Europese Centrale Bank"))'

q = {
    "query": {
        "bool": {
            "must": [
                {
                    "query_string": {
                        # Default fields; the explicit text:"..." terms in
                        # querystring2 target the "text" field instead.
                        "fields": ["title", "title_rss"],
                        "query": querystring2,
                    }
                }
            ],
            "filter": [
                # One "match" clause per requested outlet, on the ".keyword" sub-field.
                {"bool": {"should": [{"match": {"doctype.keyword": d}} for d in listofdoctypes]}},
                {"range": {"publication_date": {"gte": fromdate, "lt": todate}}},
            ],
        }
    }
}

# Alternative: iterate over the hits instead of exporting them.
# g = myinca.database.document_generator(q)
# first = next(g)

myinca.importers_exporters.export_csv(query=q, fields=exportfields)