-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsearch.py
More file actions
170 lines (151 loc) · 6.1 KB
/
search.py
File metadata and controls
170 lines (151 loc) · 6.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# NOTE(review): this module is Python 2 (print statements, ConfigParser,
# "except X, e" syntax); it will not run unmodified under Python 3.
import pymysql.cursors
import datetime
import traceback
import json
from elasticsearch import Elasticsearch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from posts import getRange
# Elasticsearch client with default settings (localhost:9200).
es = Elasticsearch()
import ConfigParser
# Database credentials are read from db.cfg ([database] section).
config = ConfigParser.ConfigParser()
config.read('db.cfg')
# Module-level MySQL connection, opened as a side effect of importing this
# module. DictCursor makes every fetched row a dict keyed by column name.
connection = pymysql.connect(host=config.get('database','host'),
user=config.get('database','username'),
password=config.get('database','password'),
db = config.get('database','db'),
charset = 'utf8mb4',
cursorclass=pymysql.cursors.DictCursor)
# Elasticsearch hit IDs for the most recent query; written by searchQuery()
# and read by fetchResults() (shared module-global state).
postIds = []
# Java reserved words (Java Language Specification keyword table). They are
# subtracted from the stop-word set in stopWordsAndLemmatize() so that
# code-significant tokens such as "for", "if" and "this" are kept.
# Fix: the source table marks const/goto with a footnote asterisk
# ("reserved but not used"); the list previously contained the literal
# token "const*", which could never match a real word.
javaKeywords = """abstract continue for new switch
assert default goto package synchronized
boolean do if private this
break double implements protected throw
byte else import public throws
case enum instanceof return transient
catch extends int short try
char final interface static void
class finally long strictfp volatile
const float native super while""".split()
# Method to remove stopwords and lemmatize the words in given text.
def stopWordsAndLemmatize(text):
    """Lowercase, stop-word-filter and lemmatize *text*.

    Splits on whitespace (not wordpunct_tokenize, so punctuation glued to a
    word is kept with it), drops English stop words and bare punctuation
    tokens -- except Java keywords, which are deliberately preserved because
    they are meaningful in code snippets -- and lemmatizes what remains.

    Returns the processed words re-joined with single spaces.
    """
    # The lemmatizer and the stop-word set are identical for every call, but
    # were previously rebuilt each time; this function runs once per row
    # while indexing, so build them lazily on first use and cache them.
    if not hasattr(stopWordsAndLemmatize, '_cache'):
        lemmatizer = WordNetLemmatizer()
        stopWordsSet = set(stopwords.words('english'))
        # Punctuation-only tokens are also discarded; punctuation attached
        # to a word survives because we split on whitespace.
        stopWordsSet.update(['.', ',', '"', "'", '?', '!',
                             ':', ';', '(', ')', '[', ']', '{', '}'])
        stopWordsSet = stopWordsSet - set(javaKeywords)
        stopWordsAndLemmatize._cache = (lemmatizer, stopWordsSet)
    lemmatizer, stopWordsSet = stopWordsAndLemmatize._cache
    return " ".join(lemmatizer.lemmatize(word.lower())
                    for word in text.split()
                    if word.lower() not in stopWordsSet)
#Method to index Posts table based on Body, Title and Tag fields.
def indexPosts():
try:
with connection.cursor() as cursor:
query = "select Id, PostTypeId, Body, Title, Tags from `Posts`"
cursor.execute(query)
c = 1
for row in cursor:
print "Row: ", c
c += 1
for key in row:
row[str(key)] = str(row.pop(key))
# Remove stopwords and lemmatize words in Body and Title
for key in ('Body', 'Title'):
row[key] = stopWordsAndLemmatize(row[key])
# The ID of each index is the ID of the post in the Posts table
docId = row.pop('Id')
es.index(index="posts_index", doc_type="posts_table",
id = docId, body=row)
except Exception, e:
print traceback.print_exc()
def fetchResults(pageId):
    """Return one page (10 hits) of question rows for the current search.

    Reads the module-global postIds (populated by searchQuery) and resolves
    each hit: question posts (PostTypeId == 1) are kept as-is; answer posts
    (PostTypeId == 2) are replaced by their parent question, skipping
    parents already present. Each returned row dict is augmented with
    UpVotes/DownVotes counts and tercile ranks (via getRange) for ViewCount
    and Usefulness.

    Returns the list of row dicts on success, or -1 on any error.
    """
    questionIds = []
    data = []
    # Page pageId covers postIds[(pageId - 1) * 10 : pageId * 10].
    endIndex = int(pageId) * 10
    startIndex = endIndex - 10
    # Loop-invariant SQL, hoisted out of the loops below.
    sqlPostId = ("SELECT P.Id, P.Title, P.ViewCount, P.Usefulness, "
                 "P.OwnerUserId, P.OwnerDisplayName, P.FavouriteCount, "
                 "P.Tags, P.AnswerCount, P.CreationDate, P.PostTypeId "
                 "from Posts as P where P.Id = %s")
    sqlParent = ("SELECT P.Id, P.Title, P.ViewCount, P.Usefulness, "
                 "P.OwnerUserId, P.OwnerDisplayName, P.FavouriteCount, "
                 "P.Tags, P.AnswerCount, P.CreationDate, P.PostTypeId "
                 "from Posts as P where P.Id in "
                 "(SELECT `ParentId` from `Posts` where `Id` = %s) LIMIT 1")
    sqlVup = "SELECT count(Id) as count from Votes where VoteTypeId = 2 and PostId = %s"
    sqlVdown = "SELECT count(Id) as count from Votes where VoteTypeId = 3 and PostId = %s"
    try:
        with connection.cursor() as cursor:
            # If the ID belongs to a question, add to the result as is. If it
            # belongs to an answer, find its question ID and add to the result.
            for postId in postIds[startIndex:endIndex]:
                rowCount = cursor.execute(sqlPostId, (postId,))
                if rowCount > 0:
                    for row in cursor.fetchall():
                        if row['PostTypeId'] == 1:
                            questionIds.append(row)
                        elif row['PostTypeId'] == 2:
                            parentRowCount = cursor.execute(sqlParent, (postId,))
                            if parentRowCount > 0:
                                for record in cursor.fetchall():
                                    # The parent ID of this answer could already be in
                                    # the list of hits, if the question itself was
                                    # a hit for this query. So skip if present in list.
                                    if record not in questionIds:
                                        questionIds.append(record)
            # Collect sorted counts so getRange can place each row in a tercile.
            viewCounts = []
            usefulnessCounts = []
            for row in questionIds:
                viewCounts.append(int(row[u'ViewCount']))
                usefulnessCounts.append(int(row[u'Usefulness']))
            viewCounts.sort()
            usefulnessCounts.sort()
            # Explicit floor division ("/" on Python 2 ints behaved the same,
            # but would produce a float under Python 3 and break slicing).
            splitAt = len(questionIds) // 3
            v1 = viewCounts[:splitAt]
            v2 = viewCounts[splitAt:splitAt*2]
            v3 = viewCounts[splitAt*2:]
            u1 = usefulnessCounts[:splitAt]
            u2 = usefulnessCounts[splitAt:splitAt*2]
            u3 = usefulnessCounts[splitAt*2:]
            for row in questionIds:
                # Renamed from "id", which shadowed the builtin.
                postRowId = row[u'Id']
                cursor.execute(sqlVup, (postRowId,))
                up = cursor.fetchone()
                cursor.execute(sqlVdown, (postRowId,))
                down = cursor.fetchone()
                row[u'CreationDate'] = str(row[u'CreationDate'])
                row[u'UpVotes'] = up[u'count']
                row[u'DownVotes'] = down[u'count']
                row[u'ViewCountRank'] = getRange(v1, v2, v3, row[u'ViewCount'])
                row[u'UsefulnessRank'] = getRange(u1, u2, u3, row[u'Usefulness'])
            data = questionIds
            return data
    except Exception:
        # Narrowed from a bare "except:" (which also swallowed
        # KeyboardInterrupt/SystemExit); print_exc() needs no print wrapper.
        traceback.print_exc()
        return -1
def searchQuery(query, resultCount):
    """Run *query* against the posts index and return the first result page.

    The query text is stop-word-filtered and lemmatized exactly like the
    indexed documents, so terms match the normalized index. Up to
    *resultCount* Elasticsearch hit IDs are stored in the module-global
    postIds, then fetchResults(1) resolves page 1 into question rows.

    Returns fetchResults' row list on success, or -1 on any error.
    """
    try:
        # Remove stopwords from query and lemmatize the words
        global postIds
        postIds = []
        query = stopWordsAndLemmatize(query)
        # Return "size" hits for the given query.
        matches = es.search(index="posts_index", q=query, size=resultCount)
        for hit in matches['hits']['hits']:
            postIds.append(str(hit['_id']))
        return fetchResults(1)
    except Exception:
        # Narrowed from a bare "except:"; print_exc() prints the trace
        # itself (the old "print traceback.print_exc()" emitted "None").
        traceback.print_exc()
        return -1
if __name__ == "__main__":
# Run this file initially to create index for Posts table.
#indexPosts()
# Sample query
qIdList = searchQuery("constructors", 2)
print qIdList
qIdList = searchQuery("abstract class", 2)
print qIdList