Skip to content

Would you like to test the Amazon dataset but find it difficult to do so? #8

@kkkkobe7

Description

@kkkkobe7

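Download the dataset you want from https://cseweb.ucsd.edu/~jmcauley/datasets/amazon/links.html (choose the 5-core version), then adjust the script below — in particular the directory and file names — to convert the raw JSON reviews into the tab-separated `user_id`, `item_id`, `rating`, `timestamp` format required by the TiSASRec model.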

import json
import os
from collections import defaultdict

# Locations of the raw review dump and of every file this script generates.
# Everything lives in raw_data/Beauty next to this script.
base_dir = os.path.join(os.path.dirname(__file__), 'raw_data', 'Beauty')
input_file, output_txt_file, reviewer_mapping_file, asin_mapping_file = (
    os.path.join(base_dir, filename)
    for filename in ('Beauty_5.json', 'Beauty.txt', 'user_map.json', 'item_map.json')
)


# Read and process JSON data
def process_reviews(input_file):
    """Convert a JSON-lines review dump into tab-separated interaction rows.

    Every non-blank line of ``input_file`` is parsed as one JSON object, from
    which the ``reviewerID``, ``asin``, ``overall`` and ``unixReviewTime``
    fields are read. Reviewer IDs and ASINs are renumbered to consecutive
    integers starting at 1, in order of first appearance in the file.

    Args:
        input_file: Path to a UTF-8 text file holding one JSON review per line.

    Returns:
        A tuple ``(data_lines, user_mapping, item_mapping)`` where
        ``data_lines`` is a list of ``"user\\titem\\trating\\ttime"`` strings
        grouped by ascending user ID (within a user, rows keep their order of
        appearance in the file; they are not sorted by time),
        ``user_mapping`` maps each ``reviewerID`` to its integer user ID, and
        ``item_mapping`` maps each ``asin`` to its integer item ID.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        TypeError: If a review has no ``overall`` field (``int(None)``).
    """
    user_mapping = {}  # reviewerID -> user ID (1-based)
    item_mapping = {}  # asin -> item ID (1-based)
    user_reviews = defaultdict(list)  # user ID -> [(item ID, rating, time), ...]

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are not records; json.loads('') would raise.
                continue
            review = json.loads(line)

            # setdefault assigns the next free ID only on first sight of a
            # key; len() is evaluated before the insertion, hence the "+ 1".
            user_id = user_mapping.setdefault(
                review.get('reviewerID'), len(user_mapping) + 1
            )
            item_id = item_mapping.setdefault(
                review.get('asin'), len(item_mapping) + 1
            )

            # The rating is truncated to an integer (e.g. 5.0 -> 5).
            user_reviews[user_id].append(
                (item_id, int(review.get('overall')), review.get('unixReviewTime'))
            )

    # Emit one row per review, users in ascending ID order.
    data_lines = [
        f"{user_id}\t{item_id}\t{rating}\t{time}"
        for user_id in sorted(user_reviews)
        for item_id, rating, time in user_reviews[user_id]
    ]

    return data_lines, user_mapping, item_mapping


# Write the interaction rows to disk, one per line
def save_output_txt(output_txt_file, data_lines):
    """Write every entry of ``data_lines`` to ``output_txt_file``.

    The file is created or overwritten as UTF-8 text, with a newline appended
    after each entry, and a confirmation message is printed afterwards.
    """
    with open(output_txt_file, 'w', encoding='utf-8') as out:
        out.writelines(row + '\n' for row in data_lines)
    print(f"Data has been saved to {output_txt_file}")


# Persist the ID mappings as JSON
def save_mappings(reviewer_mapping_file, asin_mapping_file, user_mapping, item_mapping):
    """Dump the user and item mappings to their respective JSON files.

    ``user_mapping`` goes to ``reviewer_mapping_file`` and ``item_mapping`` to
    ``asin_mapping_file``. Both files are written as UTF-8 with a 4-space
    indent and without ASCII escaping; a confirmation message is printed once
    both have been written.
    """
    targets = (
        (reviewer_mapping_file, user_mapping),
        (asin_mapping_file, item_mapping),
    )
    for path, mapping in targets:
        with open(path, 'w', encoding='utf-8') as handle:
            json.dump(mapping, handle, ensure_ascii=False, indent=4)
    print(f"Mapping files have been saved to {reviewer_mapping_file} and {asin_mapping_file}")


def main():
    """Run the whole conversion: parse the reviews, then write all outputs."""
    # Parse the raw dump into output rows plus the two ID mappings
    rows, users, items = process_reviews(input_file)

    # Interaction rows go to the text file
    save_output_txt(output_txt_file, rows)

    # The reviewerID/asin -> integer ID mappings go to JSON
    save_mappings(reviewer_mapping_file, asin_mapping_file, users, items)


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions