-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatch-algo.py
78 lines (41 loc) · 1.96 KB
/
match-algo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import pprint
# Interactive consignee lookup: load the consignee names from CSV, ask the
# user for a search term, and pretty-print every name that contains it
# (case-insensitively) together with the number of matching rows.

# Load the consignee table; the literal string "NA" is read as a missing value.
consignee = pd.read_csv("/home/intrior/projects/trial/consignee.csv", na_values="NA")

# NOTE(review): 'CONSINEE' is presumably the (misspelled) header used in the
# CSV itself -- confirm against the data before "fixing" the spelling.
# Surrounding whitespace is stripped from every name before matching.
consignee_names_series = consignee['CONSINEE'].str.strip()

# Take the search term from the user. Python 2 calls the line reader
# raw_input(); Python 3 renamed it to input(). Pick whichever exists so the
# script does not die with a NameError on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input
text_to_search = read_line('Enter consignee name to search: ').strip()

# Build the regex from the user's text instead of hard-coding it.
# re.escape() makes the text match literally, so characters such as '.' or
# '(' in a name are not interpreted as regex syntax. No leading/trailing
# '.*' is needed because str.contains() searches anywhere in the string.
regex_pattern = re.escape(text_to_search)

# Keep every row whose name contains the search text, ignoring case.
# na=False makes missing names count as "no match" instead of propagating
# NaN into the boolean mask.
results = consignee_names_series[
    consignee_names_series.str.contains(regex_pattern, flags=re.IGNORECASE, na=False)
]

# Report as {search text: {'matches': [...], 'total': n}}.
# 'matches' lists each distinct matching name once; 'total' counts every
# matching row, duplicates included.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint({
    text_to_search: {
        'matches': results.drop_duplicates().values.tolist(),
        'total': len(results),
    },
})
# If you get a list of consignees as input,
# convert it to a Series,
# extract the most recurring pattern from the list of consignees and return it as the key.
# print results
# print len(results)
# sliced_results = results.str.slice()
# print sliced_results
# this is what gives the results we want
# y[y.str.findall(r'.*censea.*', flags=re.IGNORECASE).str.len() > 0]
# this works from csv data
# y.str.findall(r'.*censea.*', flags=re.IGNORECASE)
# works for find all
# regex_pat = re.compile(r'.*censea.*', flags=re.IGNORECASE)
# results=y.str.findall(regex_pat)
# gives the strings as non-list values
# slie_series = results.str.slice()
# returns just list of all censea only
# regex_pat = re.compile(r'censea', flags=re.IGNORECASE)
# only beginning with censea
# regex_pat = re.compile(r'^censea', flags=re.IGNORECASE)