-
Notifications
You must be signed in to change notification settings - Fork 18
Open
Description
Download the dataset you want from https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html (choose the 5-core version), then adjust the code below so that it converts the raw data into the format required by the TiSASRec model.
import json
import os
from collections import defaultdict
# File locations: everything lives under <script dir>/raw_data/Beauty.
_script_dir = os.path.dirname(__file__)
base_dir = os.path.join(_script_dir, 'raw_data', 'Beauty')

# Input: raw review dump. Outputs: interaction table and the two ID mappings.
input_file = os.path.join(base_dir, 'Beauty_5.json')
output_txt_file = os.path.join(base_dir, 'Beauty.txt')
reviewer_mapping_file = os.path.join(base_dir, 'user_map.json')
asin_mapping_file = os.path.join(base_dir, 'item_map.json')
# Read and process JSON data
def process_reviews(input_file):
    """Convert a JSON-lines review dump into tab-separated interaction rows.

    Every non-blank line of ``input_file`` is parsed as one JSON object, from
    which the ``reviewerID``, ``asin``, ``overall`` and ``unixReviewTime``
    fields are read. Reviewer IDs and ASINs are replaced by consecutive
    integers starting at 1, assigned in order of first appearance.

    Args:
        input_file: Path to a UTF-8 encoded file with one JSON object per line.

    Returns:
        A tuple ``(data_lines, user_mapping, item_mapping)``:

        * ``data_lines`` -- list of strings, each holding user ID, item ID,
          integer rating and review time separated by tabs. Rows are grouped
          by ascending user ID; within a user they keep the order in which
          they appear in the input file (they are not re-sorted by time).
        * ``user_mapping`` -- dict from ``reviewerID`` to integer user ID.
        * ``item_mapping`` -- dict from ``asin`` to integer item ID.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        TypeError: If a record has no ``overall`` field (``int(None)``).
    """
    user_mapping = {}  # reviewerID -> user ID (1-based)
    item_mapping = {}  # asin -> item ID (1-based)
    user_reviews = defaultdict(list)  # user ID -> [(item ID, rating, time), ...]

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            review = json.loads(line)
            reviewer_id = review.get('reviewerID')
            asin = review.get('asin')
            overall = review.get('overall')
            unix_review_time = review.get('unixReviewTime')

            # A new key receives the next free ID; a known key keeps its ID.
            user_id = user_mapping.setdefault(reviewer_id, len(user_mapping) + 1)
            item_id = item_mapping.setdefault(asin, len(item_mapping) + 1)

            # Group review data by user ID.
            user_reviews[user_id].append((item_id, int(overall), unix_review_time))

    # Emit each user's reviews together, users in ascending ID order.
    data_lines = []
    for user_id in sorted(user_reviews):
        data_lines.extend(
            f"{user_id}\t{item_id}\t{rating}\t{time}"
            for item_id, rating, time in user_reviews[user_id]
        )
    return data_lines, user_mapping, item_mapping
# Save the output text file
def save_output_txt(output_txt_file, data_lines):
    """Write the interaction rows to a text file, one row per line.

    Args:
        output_txt_file: Destination path; the file is created or overwritten
            and written as UTF-8.
        data_lines: Iterable of strings; a newline is appended to each.
    """
    with open(output_txt_file, 'w', encoding='utf-8') as f:
        # One batched call instead of a separate write() per row.
        f.writelines(line + '\n' for line in data_lines)
    print(f"Data has been saved to {output_txt_file}")
# Save the mapping files
def save_mappings(reviewer_mapping_file, asin_mapping_file, user_mapping, item_mapping):
    """Dump the user and item ID mappings to two JSON files.

    Args:
        reviewer_mapping_file: Destination path for ``user_mapping``.
        asin_mapping_file: Destination path for ``item_mapping``.
        user_mapping: Dict from ``reviewerID`` to integer user ID.
        item_mapping: Dict from ``asin`` to integer item ID.

    Both files are created or overwritten, written as UTF-8 with a
    4-space indent, and non-ASCII characters are kept unescaped.
    """
    with open(reviewer_mapping_file, 'w', encoding='utf-8') as f:
        json.dump(user_mapping, f, ensure_ascii=False, indent=4)
    with open(asin_mapping_file, 'w', encoding='utf-8') as f:
        json.dump(item_mapping, f, ensure_ascii=False, indent=4)
    print(f"Mapping files have been saved to {reviewer_mapping_file} and {asin_mapping_file}")
def main():
    """Run the full conversion using the module-level file paths.

    Reads ``input_file``, writes the interaction rows to ``output_txt_file``
    and the two ID mappings to ``reviewer_mapping_file`` and
    ``asin_mapping_file``.
    """
    # Process JSON data
    data_lines, user_mapping, item_mapping = process_reviews(input_file)
    # Save data to txt file
    save_output_txt(output_txt_file, data_lines)
    # Save mapping files to JSON
    save_mappings(reviewer_mapping_file, asin_mapping_file, user_mapping, item_mapping)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Metadata
Metadata
Assignees
Labels
No labels