Skip to content

Commit 136abbf

Browse files
committed
small fixes and additional name matching (and dep updates)
Signed-off-by: John Seekins <john@robot-house.us>
1 parent f1e6818 commit 136abbf

File tree

11 files changed

+341
-165
lines changed

11 files changed

+341
-165
lines changed

enrichers/general.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,23 @@ def _enrich_facility(facility_data: tuple) -> tuple:
4343
wd_res = wikidata.Wikidata(facility_name=facility_name).search()
4444
osm = openstreetmap.OpenStreetMap(facility_name=facility_name, address=facility.get("address", {}))
4545
osm_res = osm.search()
46-
enriched_facility["wikipedia"]["page_url"] = wiki_res.get("url", "")
46+
url = wiki_res.get("url", None)
47+
if url:
48+
enriched_facility["wikipedia"]["page_url"] = url
4749
enriched_facility["wikipedia"]["search_query"] = wiki_res.get("search_query_steps", "")
48-
enriched_facility["wikidata"]["page_url"] = wd_res.get("url", "")
50+
url = wd_res.get("url", None)
51+
if url:
52+
enriched_facility["wikidata"]["page_url"] = url
4953
enriched_facility["wikidata"]["search_query"] = wd_res.get("search_query_steps", "")
50-
enriched_facility["osm"]["latitude"] = osm_res.get("details", {}).get("latitude", osm.default_coords["latitude"])
51-
enriched_facility["osm"]["longitude"] = osm_res.get("details", {}).get("longitude", osm.default_coords["longitude"])
52-
enriched_facility["osm"]["url"] = osm_res.get("url", "")
54+
lat = osm_res.get("details", {}).get("latitude", None)
55+
long = osm_res.get("details", {}).get("longitude", None)
56+
if lat:
57+
enriched_facility["osm"]["latitude"] = lat
58+
if long:
59+
enriched_facility["osm"]["longitude"] = long
60+
url = osm_res.get("url", None)
61+
if url:
62+
enriched_facility["osm"]["url"] = url
5363
enriched_facility["osm"]["search_query"] = osm_res.get("search_query_steps", "")
5464

5565
logger.debug(enriched_facility)

file_utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@ def export_to_file(
2323
match file_type:
2424
case "xlsx":
2525
with xlsxwriter.Workbook(full_name, {"remove_timezone": True}) as wb:
26-
writer.write_excel(workbook=wb, include_header=True, autofit=True)
26+
_ = writer.write_excel(workbook=wb, include_header=True, autofit=True)
2727
case "csv":
2828
with open(full_name, "w", newline="", encoding="utf-8") as f_out:
2929
writer.write_csv(file=f_out, include_header=True)
3030
case "parquet":
3131
writer.write_parquet(full_name, use_pyarrow=True)
32+
case _:
33+
logger.warning("Invalid dataframe output type %s", file_type)
3234
elif file_type == "json":
3335
with open(full_name, "w", encoding="utf-8") as f_out:
3436
json.dump(facilities_data, f_out, indent=2, sort_keys=True, default=str)
@@ -103,7 +105,7 @@ def print_summary(facilities_data: dict) -> None:
103105
false_positives = 0
104106
errors = 0
105107
for facility in facilities_data["facilities"].values():
106-
query = facility.get("wikipedia", {}).get("search_query", "")
108+
query: str = facility.get("wikipedia", {}).get("search_query", "")
107109
if "REJECTED" in query:
108110
false_positives += 1
109111
elif "ERROR" in query:

ice_scrapers/__init__.py

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -4,67 +4,75 @@
44
may call them
55
"""
66

7-
# extracted ADP sheet header list 2025-09-07
8-
facility_sheet_header = [
9-
"Name",
10-
"Address",
11-
"City",
12-
"State",
13-
"Zip",
14-
"AOR",
15-
"Type Detailed",
16-
"Male/Female",
17-
"FY25 ALOS",
18-
"Level A",
19-
"Level B",
20-
"Level C",
21-
"Level D",
22-
"Male Crim",
23-
"Male Non-Crim",
24-
"Female Crim",
25-
"Female Non-Crim",
26-
"ICE Threat Level 1",
27-
"ICE Threat Level 2",
28-
"ICE Threat Level 3",
29-
"No ICE Threat Level",
30-
"Mandatory",
31-
"Guaranteed Minimum",
32-
"Last Inspection Type",
33-
"Last Inspection End Date",
34-
"Pending FY25 Inspection",
35-
"Last Inspection Standard",
36-
"Last Final Rating",
37-
]
38-
397
ice_inspection_types = {
408
# found in https://www.ice.gov/foia/odo-facility-inspections
419
"ODO": "Office of Detention Oversight",
4210
# found in https://ia803100.us.archive.org/16/items/6213032-ORSA-MOU-ICE/6213032-ORSA-MOU-ICE_text.pdf
4311
"ORSA": "Operational Review Self-Assessment",
4412
}
4513

14+
# extracted from https://vera-institute.files.svdcdn.com/production/downloads/dashboard_appendix.pdf 2025-09-23
15+
ice_facility_group_mapping = {
16+
"Non-Dedicated": ["IGSA"],
17+
"Dedicated": ["DIGSA", "CDF", "SPC"],
18+
"Federal": ["BOF", "USMSIGA", "USMS IGA", "USMS CDF", "DOD", "MOC"],
19+
"Hold/Staging": ["Hold", "Staging"],
20+
"Family/Youth": ["Family", "Juvenile", "FAMILY"],
21+
"Medical": ["Hospital"],
22+
"Hotel": ["Hotel"],
23+
"Other/Unknown": ["Other", "Unknown", "Pending"],
24+
}
25+
4626
# extracted from https://www.ice.gov/doclib/detention/FY25_detentionStats08292025.xlsx 2025-09-07
4727
ice_facility_types = {
4828
"BOP": {
4929
"expanded_name": "Federal Bureau of Prisons",
5030
"description": "A facility operated by the Federal Bureau of Prisons",
5131
},
32+
"CDF": {
33+
"expanded_name": "Contract Detention Facility",
34+
"description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
35+
},
5236
"DIGSA": {
5337
"expanded_name": "Dedicated Intergovernmental Service Agreement",
5438
"description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts to use all bed space via a Dedicated Intergovernmental Service Agreement; or facilities used by ICE pursuant to Intergovernmental Service Agreements, which house only ICE detainees – typically these are operated by private contractors pursuant to their agreements with local governments.",
5539
},
5640
"DOD": {
5741
"expanded_name": "Department of Defense",
58-
"description": "Military facility",
42+
"description": "Department of Defense facilities - Often Army bases",
43+
},
44+
"FAMILY": {
45+
"expanded_name": "Family",
46+
"description": "A facility in which families are able to remain together while awaiting their proceedings",
47+
},
48+
"Family": {
49+
"expanded_name": "Family",
50+
"description": "A facility in which families are able to remain together while awaiting their proceedings",
51+
},
52+
"Hospital": {
53+
"expanded_name": "Hospital",
54+
"description": "A medical facility",
5955
},
6056
"IGSA": {
6157
"expanded_name": "Intergovernmental Service Agreement",
6258
"description": "A publicly-owned facility operated by state/local government(s), or private contractors, in which ICE contracts for bed space via an Intergovernmental Service Agreement; or local jails used by ICE pursuant to Intergovernmental Service Agreements, which house both ICE and non-ICE detainees, typically county prisoners awaiting trial or serving short sentences, but sometimes also USMS prisoners.",
6359
},
60+
"Juvenile": {
61+
"expanded_name": "Juvenile",
62+
"description": "An IGSA facility capable of housing juveniles (separate from adults) for a temporary period of time",
63+
},
64+
"Other": {
65+
"expanded_name": "Other",
66+
"description": "Facilities including but not limited to transportation-related facilities, hotels, and/or other facilities",
67+
},
6468
"SPC": {
6569
"expanded_name": "Service Processing Center",
6670
"description": "A facility owned by the government and staffed by a combination of federal and contract employees.",
6771
},
72+
"Unknown": {
73+
"expanded_name": "Unknown",
74+
"description": "A facility whose type could not be identified",
75+
},
6876
"USMS": {
6977
"expanded_name": "United States Marshals Service",
7078
"description": "A facility primarily contracted with the USMS for housing of USMS detainees, in which ICE contracts with the USMS for bed space.",
@@ -82,10 +90,6 @@
8290
"expanded_name": "United States Marshals Service Contract Detention Facility",
8391
"description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
8492
},
85-
"CDF": {
86-
"expanded_name": "Contract Detention Facility",
87-
"description": "Name derived from listing at https://www.vera.org/ice-detention-trends",
88-
},
8993
"Staging": {
9094
"description": "Some facilities in the ICE spreadsheet are marked 'Staging'. Hard to determine why.",
9195
"expanded_name": "Staging",
@@ -131,6 +135,7 @@
131135
repair_locality, # noqa: F401
132136
repair_street, # noqa: F401
133137
repair_zip, # noqa: F401
138+
repair_name, # noqa: F401
134139
special_facilities, # noqa: F401
135140
update_facility, # noqa: F401
136141
)

ice_scrapers/custom_facilities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44
Handle manually discovered/managed facilities
55
"""
6-
custom_facilities = {
6+
custom_facilities: dict = {
77
"2309 North Highway 83,McCook,NE,69001": {
88
"_repaired_record": False,
99
"address": {

ice_scrapers/facilities_scraper.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
repair_locality,
88
repair_street,
99
repair_zip,
10+
repair_name,
1011
special_facilities,
1112
update_facility,
1213
)
@@ -29,6 +30,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
2930
facilities_data["scraped_date"] = datetime.datetime.now(datetime.UTC)
3031
urls = get_ice_scrape_pages(base_scrape_url)
3132

33+
scraped_count = 0
3234
for page_num, url in enumerate(urls):
3335
logger.info("Scraping page %s/%s...", page_num + 1, len(urls))
3436
try:
@@ -37,6 +39,7 @@ def scrape_facilities(facilities_data: dict) -> dict:
3739
logger.error("Error scraping page %s: %s", page_num + 1, e)
3840
logger.debug("Found %s facilities on page %s", len(facilities), page_num + 1)
3941
time.sleep(1) # Be respectful to the server
42+
scraped_count += len(facilities)
4043
for facility in facilities:
4144
facility = special_facilities(facility)
4245
addr = facility["address"]
@@ -52,6 +55,10 @@ def scrape_facilities(facilities_data: dict) -> dict:
5255
if cleaned:
5356
addr["locality"] = locality
5457
facility["_repaired_record"] = True
58+
name, cleaned = repair_name(facility["name"], addr["locality"])
59+
if cleaned:
60+
facility["name"] = name
61+
facility["_repaired_record"] = True
5562
full_address = ",".join([street, locality, addr["administrative_area"], zcode]).upper()
5663
if not facility["address_str"]:
5764
facility["address_str"] = full_address
@@ -73,12 +80,12 @@ def scrape_facilities(facilities_data: dict) -> dict:
7380
facilities_data["facilities"][facility["name"]] = facility # type: ignore [index]
7481

7582
facilities_data["scrape_runtime"] = time.time() - start_time
76-
logger.info("Total facilities scraped: %s", len(list(facilities_data["facilities"].keys()))) # type: ignore [attr-defined]
83+
logger.info("Total facilities scraped: %s", scraped_count)
7784
logger.info(" Completed in %s seconds", facilities_data["scrape_runtime"])
7885
return facilities_data
7986

8087

81-
def _scrape_updated(url: str):
88+
def _scrape_updated(url: str) -> datetime.datetime:
8289
"""
8390
Scrape url to get "last updated" time
8491
Is specifically oriented around ice.gov facility pages
@@ -92,7 +99,7 @@ def _scrape_updated(url: str):
9299
response.raise_for_status()
93100
except Exception as e:
94101
logger.error(" Error parsing %s: %s", url, e)
95-
return []
102+
return datetime.datetime.strptime(default_timestamp, timestamp_format)
96103
soup = BeautifulSoup(response.content, "html.parser")
97104
times = soup.findAll("time")
98105
if not times:
@@ -176,7 +183,6 @@ def _scrape_page(page_url: str) -> list:
176183
facilities.append(facility_data)
177184

178185
logger.info(" Extracted %s facilities from page", len(facilities))
179-
180186
return facilities
181187

182188

ice_scrapers/general.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
from schemas import facilities_schema
1010

1111

12-
def facilities_scrape_wrapper() -> dict:
12+
def facilities_scrape_wrapper(keep_sheet: bool = True, force_download: bool = True) -> dict:
1313
facilities_data = copy.deepcopy(facilities_schema)
14-
facilities = load_sheet()
14+
facilities = load_sheet(keep_sheet, force_download)
1515
facilities_data["facilities"] = copy.deepcopy(facilities)
1616
facilities_data = scrape_facilities(facilities_data)
1717
field_offices = scrape_field_offices()

0 commit comments

Comments
 (0)