-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser_histogram.py
More file actions
72 lines (63 loc) · 2.14 KB
/
parser_histogram.py
File metadata and controls
72 lines (63 loc) · 2.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import solr
from math import log
from PIL import Image
import requests
from StringIO import StringIO
import threading
BATCH_SIZE = 100
def get_histogram_dispersion(histogram):
    """Return an entropy-based dispersion score for an image histogram.

    histogram -- a sequence of per-bin pixel counts (e.g. the list
                 returned by PIL's Image.histogram()).

    The score is -H * log2(1/H), where H is the Shannon entropy of the
    distribution of *count values* across the bins.  Low scores mean the
    histogram is concentrated in a few distinct values (blank-looking
    images); the caller treats scores below 3.5 as blank.
    """
    log2 = lambda x: log(x) / log(2)
    total = len(histogram)
    # Frequency of each distinct count value seen in the histogram.
    counts = {}
    for item in histogram:
        counts.setdefault(item, 0)
        counts[item] += 1
    ent = 0
    for i in counts:
        p = float(counts[i]) / total
        ent -= p * log2(p)
    # BUG FIX: a uniform (or empty) histogram has zero entropy; the
    # original expression divided by zero in log2(1/ent).  Zero entropy
    # means maximally concentrated, so report zero dispersion.
    if ent == 0:
        return 0.0
    return -ent * log2(1 / ent)
def get_bad_images(image_url, doc_id):
#print 'get_bad_images ', image_url, doc_id
response = requests.get(best_match_image_url)
img = Image.open(StringIO(response.content))
#img = Image.open('/Users/saurabhjain/imagification/images/imagenotfound3.jpg')
hist = img.histogram()
dispersion = get_histogram_dispersion(hist)
if dispersion < 3.5:
# blank image
print doc_id, best_match_image_url, dispersion
s = solr.SolrConnection('http://solr-prod.s-9.us:8983/solr/shoprunner')
start = 0
iteration = -1
while True:
iteration += 1
start = iteration * BATCH_SIZE
results = s.query('*:*', fields=['id', 'image_url'], rows=BATCH_SIZE, start=start).results
image_sets = [(x['id'], x['image_url']) for x in results]
count = 0
workers = []
print len(image_sets)
for doc_id, image_set in image_sets:
count += 1
# has all resolutions. we pick the biggest one for best match (hopefully?)
best_match_image_url = None
for image in image_set:
if image.startswith('original|'):
best_match_image_url = image[9:]
break
if not best_match_image_url:
continue
workers.append(
threading.Thread(target=get_bad_images, args=(best_match_image_url, doc_id), name='get_%s' % str(doc_id))
)
if count % BATCH_SIZE == 0:
print '%s processed' % count
for worker in workers:
try:
worker.start()
except Exception as ex:
print doc_id, best_match_image_url, ex
for worker in workers:
worker.join()
workers = []