Skip to content

Commit 637003a

Browse files
committed
update errors, validators and core parser
1 parent fabdd7b commit 637003a

File tree

5 files changed

+184
-0
lines changed

5 files changed

+184
-0
lines changed

pydateparser/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Version string of this package.
__version__ = '0.1.0'
2+
3+
from .date_parser import DateParser

pydateparser/_core_date_parser.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
import re
2+
import copy
3+
from shapely.geometry import LineString
4+
5+
6+
class CoreDateParser:
    """Locate date-like substrings by exhaustive literal pattern expansion.

    Each supported strftime-style directive found in a format string is
    replaced by every literal (lower-case) value it can take. The resulting
    literal strings are then searched for in the lower-cased query string.

    Attributes:
        formats: The format strings passed to the constructor.
        start_year: First year (inclusive) expanded for ``%Y`` / ``%y``.
        end_year: Last year (inclusive) expanded for ``%Y`` / ``%y``.
        values: Mapping of directive -> list of literal values.
        patterns: Mapping of format -> dict whose keys are the literal
            patterns generated for that format (all values are ``None``).
    """

    def __init__(self, formats, start_year=1900, end_year=2100):
        """Store the configuration and pre-compute all literal patterns.

        Args:
            formats: Iterable of format strings using the directives listed
                in :meth:`get_values`.
            start_year: First year (inclusive) to generate.
            end_year: Last year (inclusive) to generate.
        """
        self.formats = formats
        self.start_year = start_year
        self.end_year = end_year
        # ``values`` must exist before ``get_patterns`` runs: it reads it.
        self.values = self.get_values()
        self.patterns = self.get_patterns()

    def get_patterns(self):
        """Expand every format into all literal strings it can produce.

        Returns:
            dict mapping each format to a dict keyed by its literal patterns
            in sorted order (values are ``None``). A format that contains no
            known directive maps to an empty dict.
        """
        final_patterns = {}
        for fmat in self.formats:
            present = [d for d in self.values if d in fmat]
            # Start from the raw format and substitute one directive at a
            # time; each step multiplies the candidates by that directive's
            # number of values (a cartesian product).
            patterns = [fmat] if present else []
            for directive in present:
                choices = self.values[directive]
                patterns = [p.replace(directive, val)
                            for p in patterns for val in choices]
            final_patterns[fmat] = dict.fromkeys(sorted(patterns))
        return final_patterns

    def get_values(self):
        """Return the mapping of supported directive -> literal values."""
        return {
            "%b": ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"],
            "%d": self.get_dates(n_digits=2),
            "%-d": self.get_dates(),
            "%Y": self.get_year_value(start_year=self.start_year, end_year=self.end_year),
            "%B": ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"],
            "%m": self.get_months(),
            "%-m": self.get_months(n_digits=1),
            "%y": self.get_year_value(start_year=self.start_year, end_year=self.end_year, n_digits=2),
        }

    def get_year_value(self, start_year=1900, end_year=2100, n_digits=4):
        """Return the year strings for an inclusive year range.

        Args:
            start_year: First year (inclusive).
            end_year: Last year (inclusive).
            n_digits: ``4`` for full years; any other value yields
                zero-padded two-digit years.

        Returns:
            list of str. For two-digit output, a range spanning 100 years or
            more yields ``"00"`` .. ``"99"``.
        """
        if n_digits == 4:
            return [str(year) for year in range(start_year, end_year + 1)]
        if end_year - start_year >= 100:
            return ["{:02d}".format(d) for d in range(0, 100)]
        # ``year % 100`` is the two-digit year for any year, unlike slicing
        # ``str(year)[2:]``, which only works for exactly four-digit years.
        return ["{:02d}".format(year % 100)
                for year in range(start_year, end_year + 1)]

    def get_dates(self, n_digits=1):
        """Return day-of-month strings 1..31; zero-padded unless n_digits is 1."""
        if n_digits == 1:
            return [str(d) for d in range(1, 32)]
        return ["{:02d}".format(d) for d in range(1, 32)]

    def get_months(self, n_digits=2):
        """Return month-number strings 1..12; zero-padded when n_digits is 2."""
        if n_digits == 2:
            return ["{:02d}".format(d) for d in range(1, 13)]
        return [str(d) for d in range(1, 13)]

    def find_repeat_matches(self, query_string, sub_string, pattern):
        """Find every occurrence of ``sub_string`` in ``query_string``.

        Args:
            query_string: Text to search.
            sub_string: Literal text to look for.
            pattern: Format string recorded alongside each hit.

        Returns:
            list of ``(sub_string, start, end, pattern)`` tuples in
            left-to-right order, where ``end`` is exclusive. Empty when there
            is no occurrence, or when ``sub_string`` is empty or consists
            only of spaces.
        """
        # An empty or all-space needle survives the blanking below and would
        # therefore be found again forever.
        if not sub_string.strip(" "):
            return []
        width = len(sub_string)
        remaining = query_string
        hits = []
        while True:
            start = remaining.find(sub_string)
            if start == -1:
                break
            end = start + width
            hits.append((sub_string, start, end, pattern))
            # Blank the hit with spaces (keeping all offsets stable) so the
            # next search moves on to the following occurrence.
            remaining = remaining[:start] + " " * width + remaining[end:]
        return hits

    def parse_string(self, query_string):
        """Find date matches in ``query_string``.

        The query is lower-cased before searching, so reported character
        offsets and matched text refer to the lower-cased string.

        Returns:
            The list produced by :meth:`get_match_tokens`, or ``None`` when
            no pattern occurs in the query.
        """
        query_string = query_string.lower()
        matches = {}
        for fmat, literals in self.patterns.items():
            for literal in literals:
                hits = self.find_repeat_matches(query_string, literal, fmat)
                if hits:
                    matches.setdefault(fmat, []).extend(hits)
        prioritized = self.priority_matches(matches)
        if prioritized:
            token_spans = self.get_token_spans(query_string)
            return self.get_match_tokens(prioritized, token_spans)
        return None

    def get_match_tokens(self, priority_matches, token_spans):
        """Attach token indices to matches and drop contained matches.

        Args:
            priority_matches: Mapping of format -> list of
                ``(text, char_start, char_end, format)`` tuples.
            token_spans: Mapping as returned by :meth:`get_token_spans`.

        Returns:
            list of ``[text, char_start, char_end, format, start_token,
            end_token]`` lists, longest match first. A match is dropped when
            an already kept match covers its span and is strictly longer;
            matches with identical spans are all kept. If no token overlaps
            a match, ``start_token`` is ``len(token_spans)`` and
            ``end_token`` is ``-1``.
        """
        ret_list = []
        for candidates in priority_matches.values():
            for pm in candidates:
                char_start, char_end = pm[1], pm[2]
                # Seed with one-past-the-last token index rather than a fixed
                # constant so inputs of any length are handled.
                start_token = len(token_spans)
                end_token = -1
                for _, tok_start, tok_end, tok_idx in token_spans.values():
                    # Closed-interval overlap: spans that merely touch at an
                    # end point count as overlapping.
                    if tok_start <= char_end and char_start <= tok_end:
                        start_token = min(start_token, tok_idx)
                        end_token = max(end_token, tok_idx)
                ret_list.append(list(pm) + [start_token, end_token])
        ret_list.sort(key=lambda m: m[2] - m[1], reverse=True)
        final_ret_list = []
        for cand in ret_list:
            covered = any(
                (kept[1] <= cand[1] and kept[2] > cand[2])
                or (kept[1] < cand[1] and kept[2] >= cand[2])
                for kept in final_ret_list)
            if not covered:
                final_ret_list.append(cand)
        return final_ret_list

    def priority_matches(self, matches):
        """Keep only matched formats that are not part of a longer one.

        Formats are visited longest first; a format is discarded when it is
        a substring of a format that has already been kept.

        Args:
            matches: Mapping of format -> list of match tuples.

        Returns:
            dict with the surviving formats and their original match lists.
        """
        unique_found_formats = []
        for fmat in sorted(matches, key=len, reverse=True):
            if not any(fmat in kept for kept in unique_found_formats):
                unique_found_formats.append(fmat)
        return {k: matches[k] for k in unique_found_formats}

    def get_token_spans(self, query_string):
        """Split the lower-cased query on whitespace and locate each token.

        Returns:
            dict mapping token index -> ``(token, start, end, index)`` where
            ``start``/``end`` are character offsets into the lower-cased
            string and ``end`` is exclusive.
        """
        query_string = query_string.lower()
        ret_obj = {}
        cursor = 0
        for idx, token in enumerate(query_string.split()):
            # Search from the end of the previous token so repeated tokens
            # resolve to their own position, not the first occurrence.
            start = query_string.index(token, cursor)
            cursor = start + len(token)
            ret_obj[idx] = (token, start, cursor, idx)
        return ret_obj

pydateparser/_errors.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class DateParserException(Exception):
    """Exception subclass declared by this package; adds no behavior of its own."""

pydateparser/_loggers.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
"""Singleton logger for all modules to use."""
2+
import logging
3+
4+
# Record layout shared by both handlers: fixed prefix, timestamp, level, message.
formatter = logging.Formatter(
    'DATEPARSER: %(asctime)s %(levelname)s %(message)s')

# delay=True postpones opening 'dateparser.log' until the first record is
# emitted. Without it, merely importing this module creates the file in the
# current working directory and fails outright if that directory is read-only.
file_handler = logging.FileHandler('dateparser.log', delay=True)
file_handler.setFormatter(formatter)

# StreamHandler() with no argument writes to sys.stderr, despite the name.
stdout_handler = logging.StreamHandler()
stdout_handler.setFormatter(formatter)

# NOTE(review): the logging docs recommend logging.getLogger() over direct
# instantiation. Creating the Logger directly keeps it out of the global
# logger registry (no parent, no propagation) - confirm this is intended.
logger = logging.Logger('dateparser', level=logging.INFO)
logger.addHandler(file_handler)
logger.addHandler(stdout_handler)

pydateparser/_validators.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
def _date_format_type_validator(instance, attribute, value):
2+
if value != None and not isinstance(value, list) and not isinstance(value, str):
3+
raise ValueError(
4+
"date_format attribute can be of type 'list' or 'None' or 'str'.")
5+
6+
7+
def _end_year_validator(instance, attribute, value):
8+
if value < instance.start_year:
9+
raise ValueError(
10+
"'end_year' has to be greater than or equal to 'start_year'!")

0 commit comments

Comments
 (0)