|
| 1 | +import re |
| 2 | +import copy |
| 3 | +from shapely.geometry import LineString |
| 4 | + |
| 5 | + |
class CoreDateParser:
    """Find date-like substrings in free text by brute-force enumeration.

    Every format string is expanded into all the concrete lowercase strings
    it can produce (one per combination of directive values); parsing then
    searches the lowercased query for each of those strings verbatim.

    Supported directive keys: ``%b``, ``%B``, ``%d``, ``%-d``, ``%m``,
    ``%-m``, ``%Y`` and ``%y``.
    """

    def __init__(self, formats, start_year=1900, end_year=2100):
        """Precompute the directive values and the expanded patterns.

        Args:
            formats: iterable of format strings built from the supported
                directive keys, e.g. ``"%d %b %Y"``.
            start_year: first year (inclusive) to enumerate for ``%Y``/``%y``.
            end_year: last year (inclusive) to enumerate for ``%Y``/``%y``.
        """
        self.formats = formats
        self.start_year = start_year
        self.end_year = end_year
        # get_patterns() reads self.values, so values must be built first.
        self.values = self.get_values()
        self.patterns = self.get_patterns()

    def get_patterns(self):
        """Expand each format into every concrete string it can produce.

        Returns:
            dict mapping each format string to a dict whose keys are the
            expanded strings in sorted order (values are all ``None``; the
            dict acts as an ordered set).  A format containing none of the
            known directives maps to an empty dict.
        """
        final_patterns = {}
        for fmat in self.formats:
            present = [directive for directive in self.values if directive in fmat]
            # Start from the raw format and substitute one directive at a
            # time; each step multiplies the candidates by that directive's
            # value count.
            expanded = [fmat] if present else []
            for directive in present:
                expanded = [
                    partial.replace(directive, value)
                    for partial in expanded
                    for value in self.values[directive]
                ]
            final_patterns[fmat] = dict.fromkeys(sorted(expanded))
        return final_patterns

    def get_values(self):
        """Return the mapping of directive key -> list of lowercase values."""
        return {
            "%b": ["jan", "feb", "mar", "apr", "may", "jun",
                   "jul", "aug", "sep", "oct", "nov", "dec"],
            "%d": self.get_dates(n_digits=2),
            "%-d": self.get_dates(),
            "%Y": self.get_year_value(
                start_year=self.start_year, end_year=self.end_year),
            "%B": ["january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november",
                   "december"],
            "%m": self.get_months(),
            "%-m": self.get_months(n_digits=1),
            "%y": self.get_year_value(
                start_year=self.start_year, end_year=self.end_year,
                n_digits=2),
        }

    def get_year_value(self, start_year=1900, end_year=2100, n_digits=4):
        """Return year strings for the inclusive range.

        With ``n_digits == 4`` the years are rendered as-is via ``str``.
        Otherwise two-digit years are returned: all of ``00``..``99`` when
        the range spans a century or more, else the last two digits of each
        year in the range.
        """
        if n_digits == 4:
            return [str(year) for year in range(start_year, end_year + 1)]
        if end_year - start_year >= 100:
            return ["{:02d}".format(short) for short in range(100)]
        # Modulo instead of string slicing so years with fewer than three
        # digits do not fail on an empty slice.
        return ["{:02d}".format(year % 100)
                for year in range(start_year, end_year + 1)]

    def get_dates(self, n_digits=1):
        """Return day-of-month strings 1..31; zero-padded unless n_digits is 1."""
        template = "{}" if n_digits == 1 else "{:02d}"
        return [template.format(day) for day in range(1, 32)]

    def get_months(self, n_digits=2):
        """Return month-number strings 1..12; zero-padded only if n_digits is 2."""
        template = "{:02d}" if n_digits == 2 else "{}"
        return [template.format(month) for month in range(1, 13)]

    def find_repeat_matches(self, query_string, sub_string, pattern):
        """Locate every occurrence of ``sub_string`` in ``query_string``.

        Each hit is overwritten with spaces before searching again, so hits
        are reported left to right without reusing characters.

        Args:
            query_string: text to search.
            sub_string: literal text to look for.
            pattern: format string the literal was expanded from; copied
                into every result tuple.

        Returns:
            list of ``(sub_string, start, end, pattern)`` tuples with ``end``
            exclusive.  An empty ``sub_string`` yields an empty list.
        """
        width = len(sub_string)
        if width == 0:
            return []
        blank = " " * width
        # A needle made only of spaces survives blanking, so the search has
        # to move forward explicitly or it would find the same hit forever.
        needle_is_blank = sub_string == blank
        remaining = query_string
        cursor = 0
        hits = []
        while True:
            start = remaining.find(sub_string, cursor)
            if start == -1:
                return hits
            end = start + width
            hits.append((sub_string, start, end, pattern))
            remaining = remaining[:start] + blank + remaining[end:]
            if needle_is_blank:
                cursor = end

    def parse_string(self, query_string):
        """Find date matches in ``query_string`` (case-insensitive).

        Returns:
            ``None`` when nothing matches; otherwise the list produced by
            ``get_match_tokens``: entries of
            ``[text, char_start, char_end, format, first_token, last_token]``.
        """
        query_string = query_string.lower()
        matches = {}
        for fmat, candidates in self.patterns.items():
            for candidate in candidates:
                if candidate in query_string:
                    matches.setdefault(fmat, []).extend(
                        self.find_repeat_matches(query_string, candidate, fmat))
        prioritized = self.priority_matches(matches)
        if not prioritized:
            return None
        token_spans = self.get_token_spans(query_string)
        return self.get_match_tokens(prioritized, token_spans)

    def get_match_tokens(self, priority_matches, token_spans):
        """Attach token indices to each match and drop nested matches.

        Args:
            priority_matches: dict of format -> list of
                ``(text, char_start, char_end, format)`` tuples.
            token_spans: dict as returned by ``get_token_spans``.

        Returns:
            list of ``[text, char_start, char_end, format, first_token,
            last_token]`` sorted by match length, longest first.  A match
            strictly contained in an already accepted longer match is
            dropped.  If a match overlaps no token the indices are the
            sentinels ``1000`` and ``-1``.
        """
        candidates = []
        for hits in priority_matches.values():
            for hit in hits:
                char_start, char_end = hit[1], hit[2]
                # Closed-interval overlap: spans that merely touch at an
                # endpoint count as overlapping.
                touched = [
                    span[3]
                    for span in token_spans.values()
                    if char_start <= span[2] and span[1] <= char_end
                ]
                if touched:
                    first_token, last_token = min(touched), max(touched)
                else:
                    first_token, last_token = 1000, -1
                candidates.append(list(hit) + [first_token, last_token])

        # Longest first, so containers are accepted before their contents.
        candidates.sort(key=lambda cand: cand[2] - cand[1], reverse=True)
        accepted = []
        for cand in candidates:
            swallowed = any(
                (kept[1] <= cand[1] and kept[2] > cand[2])
                or (kept[1] < cand[1] and kept[2] >= cand[2])
                for kept in accepted
            )
            if not swallowed:
                accepted.append(cand)
        return accepted

    def priority_matches(self, matches):
        """Keep only formats that are not substrings of a longer matched format.

        NOTE(review): the filter works on whole formats, not on individual
        match positions, so every hit of a shorter format is discarded even
        when it sits elsewhere in the text - confirm this is intended.

        Args:
            matches: dict of format -> list of match tuples.

        Returns:
            dict with the surviving formats, longest format first.
        """
        surviving = []
        for fmat in sorted(matches, key=len, reverse=True):
            if not any(fmat in longer for longer in surviving):
                surviving.append(fmat)
        return {fmat: matches[fmat] for fmat in surviving}

    def get_token_spans(self, query_string):
        """Split the lowercased query on whitespace and record token offsets.

        Returns:
            dict mapping token index to ``(token, start, end, index)`` where
            ``start``/``end`` are character offsets into the lowercased
            string and ``end`` is exclusive.
        """
        query_string = query_string.lower()
        spans = {}
        cursor = 0
        for idx, token in enumerate(query_string.split()):
            # Only whitespace lies between the cursor and the next token, so
            # a forward search lands exactly on that token.
            start = query_string.find(token, cursor)
            cursor = start + len(token)
            spans[idx] = (token, start, cursor, idx)
        return spans
0 commit comments