3333demoji .download_codes ()
3434from nltk .tokenize import TweetTokenizer
3535
36- #gobal
36+ # Global
37+
3738PunctChars = r'''[`'“".?!,:;]'''
3839Punct = '%s+' % PunctChars
3940Entity = '&(amp|lt|gt|quot);'
4041printable = set (string .printable )
4142
42- # helper functoins
43+ # Helper functions.
44+
4345def regex_or (* items ):
4446 r = '|' .join (items )
4547 r = '(' + r + ')'
@@ -55,6 +57,7 @@ def optional(r):
5557 return '(%s)?' % r
5658
5759def trim (transient_tweet_text ):
60+
5861 '''
5962 trim leading and trailing spaces in the tweet text
6063 '''
@@ -112,7 +115,8 @@ def process_URLs(transient_tweet_text):
112115 Url_RE = re .compile ("(%s)" % Url , re .U | re .I )
113116 transient_tweet_text = re .sub (Url_RE , " constanturl " , transient_tweet_text )
114117
115- # fix to handle unicodes in URL
118+ # Fix to handle unicodes in URL.
119+
116120 URL_regex2 = r'\b(htt)[p\:\/]*([\\x\\u][a-z0-9]*)*'
117121 transient_tweet_text = re .sub (URL_regex2 , " constanturl " , transient_tweet_text )
118122 return transient_tweet_text
@@ -155,9 +159,9 @@ def process_Dates(transient_tweet_text):
155159 '''
156160 Identify date and convert it to constant
157161 '''
158- #transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
159- #transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
160- #date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
162+ # transient_tweet_text = re.sub(r'(\d+/\d+/\d+)', " constantdate " , transient_tweet_text)
163+ # transient_tweet_text = re.sub(r'constantnum[\s]?(/|-)[\s]?constantnum[\s]?(/|-)[\s]?constantnum', " constantdate " , transient_tweet_text)
164+ # date_regex = r'(constantnum)[\s]*(st|nd|rd|th)[\s]*(january|jan|february|feb|march|mar|april|may|june|jun|july|august|aug|september|sep|october|oct|november|nov|december|dec)'
161165 date_regex1 = r'\b((0|1|2|3)?[0-9][\s]*)[-./]([\s]*([012]?[0-9])[\s]*)([-./]([\s]*(19|20)[0-9][0-9]))?\b'
162166 transient_tweet_text = re .sub (date_regex1 , ' constantdate ' , transient_tweet_text )
163167 date_regex2 = r'\b((19|20)[0-9][0-9][\s]*[-./]?)?[\s]*([012]?[0-9])[\s]*[-./][\s]*(0|1|2|3)?[0-9]\b'
@@ -221,37 +225,37 @@ def identify_Savings(transient_tweet_text):
221225 '''
222226 identify sale/save offers
223227 '''
224- #sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*(constantnum)*[\s]*[%]?[\s]*(-|~)?[\s]*(constantnum)*[\s]*[%]?'
228+ # sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*(constantnum)*[\s]*[%]?[\s]*(-|~)?[\s]*(constantnum)*[\s]*[%]?'
225229 sale_regex = r'(?<!#)\b(discount|discounts|sale|save|saver|super[\s]*saver|super[\s]*save)\b[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?[\s]*(-|~|or)?[\s]*((rs|\$)*[\s]*(constantnum))*[\s]*[%]?'
226230 transient_tweet_text = re .sub (sale_regex , " constantdiscount " , transient_tweet_text )
227- #discount_List = []
228- #discount_List = re.findall(r'constantdiscount', transient_tweet_text)
231+ # discount_List = []
232+ # discount_List = re.findall(r'constantdiscount', transient_tweet_text)
229233 return transient_tweet_text
230234
def indentify_Offers(transient_tweet_text):
    '''
    Identify cashbacks and offers - substrings of the form "30% off",
    "30% cashback", "$30 off" or "up to 30% - 50% off" - and replace each
    of them by " constantoffer ".

    The text is expected to have been normalised already: numbers appear
    as the "constantnum" placeholder and the text is lowercase.

    A bare keyword (offer, offers, off, cashback, cash back, cash) is
    replaced as well, because every amount part of the pattern is optional.
    Keywords directly preceded by '#' (hashtags) are left untouched.

    :param transient_tweet_text: tweet text to process
    :return: the text with offers replaced by " constantoffer "
    '''
    # One amount: optional currency marker(s) followed by the number placeholder.
    amount = r'(?:rs|\$)*[\s]*constantnum'
    offer_regex = (
        r'(?<!#)\b'
        # Optional leading amount, e.g. "up to constantnum %".
        r'(?:(?:up[\s]?to)?' + amount + r'[\s]*[%]?)?'
        # Optional range separator. This must be an alternation: a character
        # class such as [-|~|or] would match the single letters "o" / "r".
        r'[\s]*(?:-|~|or)?[\.]?[\s]*'
        # Optional second amount, e.g. the upper bound of a range.
        r'(?:' + amount + r')*[\s]*[%]?[\s]*'
        # "cash back" is listed before "cash" so the longer form wins, and the
        # trailing \b keeps words such as "office" or "cashier" from matching.
        r'(?:offers?|off|cashback|cash back|cash)\b'
    )
    return re.sub(offer_regex, " constantoffer ", transient_tweet_text)
242246
def indentify_Promos(transient_tweet_text):
    '''
    Identify coupons/promos, with or without a promo code, and replace
    them by " constantpromo ".

    Assumption - a promo code immediately follows the promo/code/promocode
    keyword and is either the "constantalphanum" placeholder, a lowercase
    word, or a run of digits. Keywords directly preceded by '#' are skipped.
    '''
    keyword = r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code)\b'
    # The order matters: each substitution runs on the output of the previous one.
    promo_patterns = (
        keyword + r'[\s]*(constantalphanum)\b',   # keyword + alphanumeric placeholder
        keyword + r'[\s]*[a-z]+\b',               # keyword + lowercase word
        keyword + r'[\s]*[0-9]+\b',               # keyword + digits
        # Bare keyword (optionally plural), including a stand-alone "coupon".
        r'(?<!#)\b(?:promocode|promo code|promo|coupon code|code|coupon)[s]?\b',
    )
    processed_text = transient_tweet_text
    for pattern in promo_patterns:
        processed_text = re.sub(pattern, " constantpromo ", processed_text)
    return processed_text
256260
257261def indentify_Money (transient_tweet_text ):
@@ -264,8 +268,8 @@ def indentify_Money(transient_tweet_text):
264268 transient_tweet_text = re .sub (money_regex2 , " constantmoney " , transient_tweet_text )
265269 money_regex3 = r'(\$|rs)[\s]*constantalphanum'
266270 transient_tweet_text = re .sub (money_regex3 , " constantmoney " , transient_tweet_text )
267- #Money_List = []
268- #Money_List = re.findall(r'constantmoney', transient_tweet_text)
271+ # Money_List = []
272+ # Money_List = re.findall(r'constantmoney', transient_tweet_text)
269273 return transient_tweet_text
270274
271275def indentify_freebies (transient_tweet_text ):
@@ -360,9 +364,6 @@ def deEmojify(transient_tweet_text):
360364# ############
361365# print_test()
362366
363-
364-
365-
366367def process_TweetText (tweet_text ):
367368 '''
368369 Takes tweet_text and preprocesses it
@@ -372,44 +373,50 @@ def process_TweetText(tweet_text):
372373 '''
373374
374375 # get utf-8 encoding, lowercase, trim and remove multiple white spaces
376+
375377 transient_tweet_text = tweet_text
376378 transient_tweet_text = strip_unicode (transient_tweet_text )
377- #print "PROCESSED: ", transient_tweet_text
379+
380+ # print "PROCESSED: ", transient_tweet_text
378381
379382 transient_tweet_text = to_LowerCase (transient_tweet_text )
380383 transient_tweet_text = trim (transient_tweet_text )
381384 transient_tweet_text = strip_whiteSpaces (transient_tweet_text )
382385 transient_tweet_text = remove_spl_words (transient_tweet_text )
383386
384-
385- #emoji
387+ # Emoji
388+
386389 transient_tweet_text = remove_emoji (transient_tweet_text )
387390 transient_tweet_text = deEmojify (transient_tweet_text )
388- # process Hastags, URLs, Websites, process_EmailIds
391+
392+ # Process Hashtags, URLs, Websites, process_EmailIds
389393 # Give precedence to url over hashtag
394+
390395 transient_tweet_text = process_URLs (transient_tweet_text )
391396 transient_tweet_text = process_HashTags (transient_tweet_text )
392- #transient_tweet_text = process_Websites(transient_tweet_text)
397+
398+ # transient_tweet_text = process_Websites(transient_tweet_text)
399+
393400 transient_tweet_text = process_EmailIds (transient_tweet_text )
394401
395- # process for brand mention, any other mention and brand Name
396- #transient_tweet_text = process_BrandMentions(transient_tweet_text)
397- #transient_tweet_text = process_NonBrandMentions(transient_tweet_text)
402+ # Process for brand mention, any other mention and brand Name
403+ # transient_tweet_text = process_BrandMentions(transient_tweet_text)
404+ # transient_tweet_text = process_NonBrandMentions(transient_tweet_text)
398405 transient_tweet_text = process_Mentions (transient_tweet_text )
399406 #transient_tweet_text = process_BrandName(transient_tweet_text)
400407
401- # remove any unicodes
408+ # Remove any unicodes
402409 transient_tweet_text = strip_unicode (transient_tweet_text )
403410
404- # identify Date / Time if any
411+ # Identify Date / Time if any
405412 transient_tweet_text = process_Times (transient_tweet_text )
406413 transient_tweet_text = process_Dates (transient_tweet_text )
407414
408- # indentify alphanums and nums
415+ # Identify alphanums and nums
409416 transient_tweet_text = identify_AlphaNumerics (transient_tweet_text )
410417 transient_tweet_text = replace_numbers (transient_tweet_text )
411418
412- # identify promos, savings, offers, money and freebies
419+ # Identify promos, savings, offers, money and freebies
413420 transient_tweet_text = indentify_Promos (transient_tweet_text )
414421 transient_tweet_text = identify_Savings (transient_tweet_text )
415422 transient_tweet_text = indentify_Offers (transient_tweet_text )
@@ -424,4 +431,4 @@ def process_TweetText(tweet_text):
424431 return transient_tweet_text
425432
426433# if __name__ == "__main__":
427- # print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))
434+ # print(process_TweetText("Nice @varun paytm @paytm saver abc@gmail.com sizes for the wolf on 20/10/2010 at 10:00PM grey/deep royal-volt Nike Air Skylon II retro are 40% OFF for a limited time at $59.99 + FREE shipping.BUY HERE -> https://bit.ly/2L2n7rB (promotion - use code MEMDAYSV at checkout)"))
0 commit comments