Amazon CloudSearch spike project
Gist: https://gist.github.com/nanvel/4f7696174ac3a9b3554c
"""
Search bebop series.
"""
import arrow
import json
from tornado import options
from tornado.httpclient import HTTPError, HTTPClient, HTTPRequest
from tornado_botocore import Botocore
from tvs import TVS
DOMAIN_NAME = 'test-bebop-domain'
API_VERSION = '2013-01-01'
def _error_text(error):
    """Return a printable description of a failed HTTP call.

    Tornado's ``HTTPError.response`` is ``None`` when no HTTP response was
    received at all (e.g. timeout or connection failure).  In that case
    fall back to the exception's own message instead of failing on
    ``None.body``; otherwise return the response body, which carries the
    service's error message.
    """
    response = getattr(error, 'response', None)
    if response is None:
        return str(error)
    return response.body


def _endpoint_url(endpoint, api_version, resource):
    """Build ``http://<endpoint>/<api_version>/<resource>``."""
    return 'http://{endpoint}/{api_version}/{resource}'.format(
        endpoint=endpoint, api_version=api_version, resource=resource)


def _signed_request(signer, url, method, body=None):
    """Create a JSON ``HTTPRequest`` and sign it with *signer*'s auth.

    *signer* is a ``Botocore`` operation object; the auth object of its
    endpoint is used to add the authentication data to the request.
    """
    request = HTTPRequest(
        url=url, body=body,
        headers={'Content-Type': 'application/json'}, method=method)
    # Set before signing, as in the original spike; presumably the botocore
    # signer expects a ``params`` attribute on the request -- TODO confirm.
    request.params = None
    signer.endpoint.auth.add_auth(request=request)
    return request


def _build_batch(tvs):
    """Convert the TV series records into CloudSearch ``add`` operations.

    ``airdate`` is parsed with arrow (either ``YYYY-MM-DD`` or
    ``MMMM D, YYYY``) and stored as a Unix timestamp.
    """
    return [{
        'type': 'add', 'id': tv['number'],
        'fields': {
            'title': tv['title'],
            'content': tv['content'],
            'airdate': arrow.get(
                tv['airdate'], ['YYYY-MM-DD', 'MMMM D, YYYY']).timestamp,
            'genre': tv['genre'],
        }
    } for tv in tvs]


def main():
    """Run the CloudSearch spike end to end.

    Steps: create (or reuse) the domain, define the index fields, look up
    the document/search endpoints, request reindexing, upload the documents
    and finally run a sample search.  Results and service errors are
    printed to stdout.
    """
    options.parse_command_line()

    # create domain
    cs_create_domain = Botocore(
        service='cloudsearch', operation='CreateDomain',
        region_name='us-west-2')
    session = cs_create_domain.session
    try:
        # create domain, domain will be reused if already exists
        print(cs_create_domain.call(domain_name=DOMAIN_NAME))
        # {
        #     "DomainStatus":{
        #         "DomainId":"240020657974/test-bebop-domain",
        #         "Created":true,
        #         "SearchService":{},
        #         "SearchInstanceCount":0,
        #         "DomainName":"test-bebop-domain",
        #         "DocService":{},
        #         "Deleted":false,
        #         "Processing":false,
        #         "RequiresIndexDocuments":false,
        #         "ARN":"arn:aws:cloudsearch:us-west-2:240020657974:domain/test-bebop-domain",
        #         "SearchPartitionCount":0
        #     },
        #     "ResponseMetadata":{
        #         "RequestId":"38b0cba7-60f2-11e4-980e-6d6976ea3108"
        #     }
        # }
    except HTTPError as e:
        print(_error_text(e))

    # configure fields
    cs_define_index_field = Botocore(
        service='cloudsearch', operation='DefineIndexField',
        region_name='us-west-2', session=session)
    # Fields:
    # - title - text + show in result
    # - airdate - int (Unix timestamp)
    # - genre - literal-array + facet enabled
    # - content - text
    index_fields = [{
        'DomainName': DOMAIN_NAME,
        'IndexField': {
            'IndexFieldName': 'title',
            'IndexFieldType': 'text',
            'TextOptions': {
                'HighlightEnabled': False,
                'DefaultValue': 'untitled',
                'ReturnEnabled': True,
            }
        }
    }, {
        'DomainName': DOMAIN_NAME,
        'IndexField': {
            'IndexFieldName': 'content',
            'IndexFieldType': 'text',
            'TextOptions': {
                'HighlightEnabled': False,
                'DefaultValue': '',
                'ReturnEnabled': False,
            }
        }
    }, {
        'DomainName': DOMAIN_NAME,
        'IndexField': {
            'IndexFieldName': 'airdate',
            'IndexFieldType': 'int',
            'IntOptions': {
                # 946684800 == 2000-01-01T00:00:00Z
                'DefaultValue': 946684800,
            }
        }
    }, {
        'DomainName': DOMAIN_NAME,
        'IndexField': {
            'IndexFieldName': 'genre',
            'IndexFieldType': 'literal-array',
            'LiteralArrayOptions': {
                'DefaultValue': '',
                'FacetEnabled': True,
                'ReturnEnabled': False,
                'SearchEnabled': True,
            }
        }
    }]
    try:
        # The first failing definition aborts the remaining ones.
        for params in index_fields:
            print(cs_define_index_field.call(**params))
    except HTTPError as e:
        print(_error_text(e))

    # add data
    batch = _build_batch(TVS)

    # get document and search endpoints
    cs_describe_domains = Botocore(
        service='cloudsearch', operation='DescribeDomains',
        region_name='us-west-2', session=session)
    response = cs_describe_domains.call(domain_names=[DOMAIN_NAME])
    # {
    #     "DomainStatusList":[
    #         {
    #             "DomainId":"240020657974/test-bebop-domain",
    #             "Created":true,
    #             "SearchService":{
    #                 "Endpoint":"search-test-bebop-domain-kmvxd5zzot4opij6zvb6okvrma.us-west-2.cloudsearch.amazonaws.com"
    #             },
    #             "SearchInstanceCount":1,
    #             "DomainName":"test-bebop-domain",
    #             "DocService":{
    #                 "Endpoint":"doc-test-bebop-domain-kmvxd5zzot4opij6zvb6okvrma.us-west-2.cloudsearch.amazonaws.com"
    #             },
    #             "SearchInstanceType":"search.m1.small",
    #             "Deleted":false,
    #             "Processing":false,
    #             "RequiresIndexDocuments":true,
    #             "ARN":"arn:aws:cloudsearch:us-west-2:240020657974:domain/test-bebop-domain",
    #             "SearchPartitionCount":1
    #         }
    #     ],
    #     "ResponseMetadata":{
    #         "RequestId":"7993ac9b-6101-11e4-8510-8ffcccb94f21"
    #     }
    # }
    domain_status = response['DomainStatusList'][0]
    search_endpoint = domain_status['SearchService']['Endpoint']
    document_endpoint = domain_status['DocService']['Endpoint']

    httpclient = HTTPClient()
    try:
        # reindex
        cs_index_documents = Botocore(
            service='cloudsearch', operation='IndexDocuments',
            region_name='us-west-2', session=session)
        print(cs_index_documents.call(domain_name=DOMAIN_NAME))
        # TODO: wait until reindexing completes (not implemented; the upload
        # below is attempted immediately).

        # add documents
        request = _signed_request(
            cs_describe_domains,
            _endpoint_url(document_endpoint, API_VERSION, 'documents/batch'),
            method='POST', body=json.dumps(batch))
        try:
            response = httpclient.fetch(request=request)
            print(response.body)
        except HTTPError as e:
            print(_error_text(e))

        # search
        request = _signed_request(
            cs_describe_domains,
            _endpoint_url(search_endpoint, API_VERSION, 'search?q=bebop'),
            method='GET')
        try:
            response = httpclient.fetch(request=request)
            print(response.body)
            # {
            #     "status":{
            #         "rid":"st/UtJYpAAoghec=",
            #         "time-ms":82
            #     },
            #     "hits":{
            #         "found":12,
            #         "start":0,
            #         "hit":[
            #             {
            #                 "id":"3",
            #                 "fields":{
            #                     "airdate":"910396800",
            #                     "title":"Honky Tonk Women"
            #                 }
            #             },
            #             {
            #                 "id":"18",
            #                 "fields":{
            #                     "airdate":"920073600",
            #                     "title":"Speak Like a Child"
            #                 }
            #             },
            #             ...
            #         ]
            #     }
            # }
        except HTTPError as e:
            # Handled like the upload call so the service's error body is shown.
            print(_error_text(e))
    finally:
        # Release the resources held by the blocking client.
        httpclient.close()


if __name__ == '__main__':
    main()
Licensed under CC BY-SA 3.0