-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsearch.py
More file actions
170 lines (151 loc) · 6.1 KB
/
search.py
File metadata and controls
170 lines (151 loc) · 6.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# NOTE(review): this module is Python 2 (print statements, ConfigParser,
# "except X, e" syntax); it will not run unmodified under Python 3.
import pymysql.cursors
import datetime
import traceback
import json
from elasticsearch import Elasticsearch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from posts import getRange
# Elasticsearch client with default settings (localhost:9200).
es = Elasticsearch()
import ConfigParser
# Database credentials are read from db.cfg ([database] section).
config = ConfigParser.ConfigParser()
config.read('db.cfg')
# Module-level MySQL connection, opened as a side effect of importing this
# module. DictCursor makes every fetched row a dict keyed by column name.
connection = pymysql.connect(host=config.get('database','host'),
user=config.get('database','username'),
password=config.get('database','password'),
db = config.get('database','db'),
charset = 'utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
# Elasticsearch hit IDs for the most recent query; written by searchQuery()
# and read by fetchResults() (shared module-global state).
postIds = []
# Java reserved words (Java Language Specification keyword table). They are
# subtracted from the stop-word set in stopWordsAndLemmatize() so that
# code-significant tokens such as "for", "if" and "this" are kept.
# Fix: the source table marks const/goto with a footnote asterisk
# ("reserved but not used"); the list previously contained the literal
# token "const*", which could never match a real word.
javaKeywords = """abstract continue for new switch
assert default goto package synchronized
boolean do if private this
break double implements protected throw
byte else import public throws
case enum instanceof return transient
catch extends int short try
char final interface static void
class finally long strictfp volatile
const float native super while""".split()
# Method to remove stopwords and lemmatize the words in given text.
def stopWordsAndLemmatize(text):
    """Lowercase, stop-word-filter and lemmatize *text*.

    Splits on whitespace (not wordpunct_tokenize, so punctuation glued to a
    word is kept with it), drops English stop words and bare punctuation
    tokens -- except Java keywords, which are deliberately preserved because
    they are meaningful in code snippets -- and lemmatizes what remains.

    Returns the processed words re-joined with single spaces.
    """
    # The lemmatizer and the stop-word set are identical for every call, but
    # were previously rebuilt each time; this function runs once per row
    # while indexing, so build them lazily on first use and cache them.
    if not hasattr(stopWordsAndLemmatize, '_cache'):
        lemmatizer = WordNetLemmatizer()
        stopWordsSet = set(stopwords.words('english'))
        # Punctuation-only tokens are also discarded; punctuation attached
        # to a word survives because we split on whitespace.
        stopWordsSet.update(['.', ',', '"', "'", '?', '!',
                             ':', ';', '(', ')', '[', ']', '{', '}'])
        stopWordsSet = stopWordsSet - set(javaKeywords)
        stopWordsAndLemmatize._cache = (lemmatizer, stopWordsSet)
    lemmatizer, stopWordsSet = stopWordsAndLemmatize._cache
    return " ".join(lemmatizer.lemmatize(word.lower())
                    for word in text.split()
                    if word.lower() not in stopWordsSet)
#Method to index Posts table based on Body, Title and Tag fields.
def indexPosts():
try:
with connection.cursor() as cursor:
query = "select Id, PostTypeId, Body, Title, Tags from `Posts`"
cursor.execute(query)
c = 1
for row in cursor:
print "Row: ", c
c += 1
for key in row:
row[str(key)] = str(row.pop(key))
# Remove stopwords and lemmatize words in Body and Title
for key in ('Body', 'Title'):
row[key] = stopWordsAndLemmatize(row[key])
# The ID of each index is the ID of the post in the Posts table
docId = row.pop('Id')
es.index(index="posts_index", doc_type="posts_table",
id = docId, body=row)
except Exception, e:
print traceback.print_exc()
def fetchResults(pageId):
    """Return one page (10 hits) of question rows for the current search.

    Reads the module-global postIds (populated by searchQuery) and resolves
    each hit: question posts (PostTypeId == 1) are kept as-is; answer posts
    (PostTypeId == 2) are replaced by their parent question, skipping
    parents already present. Each returned row dict is augmented with
    UpVotes/DownVotes counts and tercile ranks (via getRange) for ViewCount
    and Usefulness.

    Returns the list of row dicts on success, or -1 on any error.
    """
    questionIds = []
    data = []
    # Page pageId covers postIds[(pageId - 1) * 10 : pageId * 10].
    endIndex = int(pageId) * 10
    startIndex = endIndex - 10
    # Loop-invariant SQL, hoisted out of the loops below.
    sqlPostId = ("SELECT P.Id, P.Title, P.ViewCount, P.Usefulness, "
                 "P.OwnerUserId, P.OwnerDisplayName, P.FavouriteCount, "
                 "P.Tags, P.AnswerCount, P.CreationDate, P.PostTypeId "
                 "from Posts as P where P.Id = %s")
    sqlParent = ("SELECT P.Id, P.Title, P.ViewCount, P.Usefulness, "
                 "P.OwnerUserId, P.OwnerDisplayName, P.FavouriteCount, "
                 "P.Tags, P.AnswerCount, P.CreationDate, P.PostTypeId "
                 "from Posts as P where P.Id in "
                 "(SELECT `ParentId` from `Posts` where `Id` = %s) LIMIT 1")
    sqlVup = "SELECT count(Id) as count from Votes where VoteTypeId = 2 and PostId = %s"
    sqlVdown = "SELECT count(Id) as count from Votes where VoteTypeId = 3 and PostId = %s"
    try:
        with connection.cursor() as cursor:
            # If the ID belongs to a question, add to the result as is. If it
            # belongs to an answer, find its question ID and add to the result.
            for postId in postIds[startIndex:endIndex]:
                rowCount = cursor.execute(sqlPostId, (postId,))
                if rowCount > 0:
                    for row in cursor.fetchall():
                        if row['PostTypeId'] == 1:
                            questionIds.append(row)
                        elif row['PostTypeId'] == 2:
                            parentRowCount = cursor.execute(sqlParent, (postId,))
                            if parentRowCount > 0:
                                for record in cursor.fetchall():
                                    # The parent ID of this answer could already be in
                                    # the list of hits, if the question itself was
                                    # a hit for this query. So skip if present in list.
                                    if record not in questionIds:
                                        questionIds.append(record)
            # Collect sorted counts so getRange can place each row in a tercile.
            viewCounts = []
            usefulnessCounts = []
            for row in questionIds:
                viewCounts.append(int(row[u'ViewCount']))
                usefulnessCounts.append(int(row[u'Usefulness']))
            viewCounts.sort()
            usefulnessCounts.sort()
            # Explicit floor division ("/" on Python 2 ints behaved the same,
            # but would produce a float under Python 3 and break slicing).
            splitAt = len(questionIds) // 3
            v1 = viewCounts[:splitAt]
            v2 = viewCounts[splitAt:splitAt*2]
            v3 = viewCounts[splitAt*2:]
            u1 = usefulnessCounts[:splitAt]
            u2 = usefulnessCounts[splitAt:splitAt*2]
            u3 = usefulnessCounts[splitAt*2:]
            for row in questionIds:
                # Renamed from "id", which shadowed the builtin.
                postRowId = row[u'Id']
                cursor.execute(sqlVup, (postRowId,))
                up = cursor.fetchone()
                cursor.execute(sqlVdown, (postRowId,))
                down = cursor.fetchone()
                row[u'CreationDate'] = str(row[u'CreationDate'])
                row[u'UpVotes'] = up[u'count']
                row[u'DownVotes'] = down[u'count']
                row[u'ViewCountRank'] = getRange(v1, v2, v3, row[u'ViewCount'])
                row[u'UsefulnessRank'] = getRange(u1, u2, u3, row[u'Usefulness'])
            data = questionIds
            return data
    except Exception:
        # Narrowed from a bare "except:" (which also swallowed
        # KeyboardInterrupt/SystemExit); print_exc() needs no print wrapper.
        traceback.print_exc()
        return -1
def searchQuery(query, resultCount):
    """Run *query* against the posts index and return the first result page.

    The query text is stop-word-filtered and lemmatized exactly like the
    indexed documents, so terms match the normalized index. Up to
    *resultCount* Elasticsearch hit IDs are stored in the module-global
    postIds, then fetchResults(1) resolves page 1 into question rows.

    Returns fetchResults' row list on success, or -1 on any error.
    """
    try:
        # Remove stopwords from query and lemmatize the words
        global postIds
        postIds = []
        query = stopWordsAndLemmatize(query)
        # Return "size" hits for the given query.
        matches = es.search(index="posts_index", q=query, size=resultCount)
        for hit in matches['hits']['hits']:
            postIds.append(str(hit['_id']))
        return fetchResults(1)
    except Exception:
        # Narrowed from a bare "except:"; print_exc() prints the trace
        # itself (the old "print traceback.print_exc()" emitted "None").
        traceback.print_exc()
        return -1
if __name__ == "__main__":
# Run this file initially to create index for Posts table.
#indexPosts()
# Sample query
qIdList = searchQuery("constructors", 2)
print qIdList
qIdList = searchQuery("abstract class", 2)
print qIdList