Skip to content

Commit 4ce7a7c

Browse files
Example notebook for Gaming industry (#362)
* Add files via upload A notebook that generates synthetic log in data for various gamers. The gamers can have multiple devices that are consistent across them and are located in different areas. * Update VideoGameLoginSyntheticDataGeneration.py Fixed code with comments from PR * Update and rename VideoGameLoginSyntheticDataGeneration.py to gaming_data_generation.py
1 parent 21613e2 commit 4ce7a7c

File tree

1 file changed

+266
-0
lines changed

1 file changed

+266
-0
lines changed
Lines changed: 266 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
# Databricks notebook source
# MAGIC %md
# MAGIC
# MAGIC # Getting Started with the Databricks Labs Data Generator
# MAGIC This notebook provides an introduction to synthetic data generation using the [Databricks Labs Data Generator (`dbldatagen`)](https://databrickslabs.github.io/dbldatagen/public_docs/index.html). This data generator is useful for generating large synthetic datasets for development, testing, benchmarking, proofs-of-concept, and other use-cases.
# MAGIC
# MAGIC The notebook simulates data for a user login scenario for the gaming industry.
# COMMAND ----------
10+
11+
# DBTITLE 1,Install dbldatagen
12+
# dbldatagen can be installed using pip install commands, as a cluster-scoped library, or as a serverless environment-scoped library.
13+
%pip install dbldatagen
14+
15+
# COMMAND ----------

# DBTITLE 1,Import Modules
import dbldatagen as dg

from pyspark.sql.types import DoubleType, StringType, TimestampType, LongType
from pyspark.sql.functions import col, expr, sha2, to_date, hour
# COMMAND ----------

# DBTITLE 1,Set up Parameters
# Set up how many rows we want along with how many users, devices and IPs we want.
# Devices outnumber users so some accounts log in from more than one device;
# IPs are fewer than devices so addresses are shared across devices.
ROW_COUNT = 4500000
NUMBER_OF_USERS = 200000
NUMBER_OF_DEVICES = NUMBER_OF_USERS + 50000
NUMBER_OF_IPS = 40000

# Inclusive window for generated login event timestamps.
START_TIMESTAMP = "2025-03-01 00:00:00"
END_TIMESTAMP = "2025-03-30 00:00:00"
# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Generation Specifications
# MAGIC
# MAGIC Let's start by generating a DataFrame with rows representing unique login information. Data generation is controlled by a `DataGenerator` object. Each `DataGenerator` can be extended with rules specifying the output schema and value generation. Columns can be defined using `withColumn(...)` with a variety of parameters.
# MAGIC
# MAGIC **colName** – Name of column to add. If this conflicts with the underlying seed column (id), it is recommended that the seed column name is customized during the construction of the data generator spec.
# MAGIC
# MAGIC **colType** – Data type for column. This may be specified as either a type from one of the possible pyspark.sql.types (e.g. `StringType`, `DecimalType(10,3)` etc.) or as a string containing a Spark SQL type definition (e.g. `String`, `array<Integer>`, `map<String, Float>`).
# MAGIC
# MAGIC **omit** – If True, the column will be omitted from the final set of columns in the generated data. Used to create columns that are used by other columns as intermediate results. Defaults to False.
# MAGIC
# MAGIC **expr** – Specifies a SQL expression used to create the column value. If specified, overrides the default rules for creating the column value. Defaults to None.
# MAGIC
# MAGIC **baseColumn** – String or list of columns to control the order of generation of columns. If not specified, the column is dependent on the base seed column (which defaults to id).
# COMMAND ----------

# DBTITLE 1,Generate a DataFrame
# Data generation spec for synthetic login events. Columns marked omit=True are
# intermediate values used only to derive other columns deterministically, so a
# given account always maps to the same devices, locations and IPs.
# NOTE(review): columns computed via `expr` now all declare their `baseColumn`
# dependencies (CLIENTNAME and SESSION_ID previously omitted them) so dbldatagen
# builds the referenced columns first, consistent with CLIENTID/CITY/ISP below.
default_annotations_spec = (
    dg.DataGenerator(spark, name="default_annotations_spec", rows=ROW_COUNT)
    .withColumn(
        "EVENT_TIMESTAMP",
        TimestampType(),
        data_range=dg.DateRange(START_TIMESTAMP, END_TIMESTAMP, "seconds=1"),
        random=True,
    )  # Random event timestamp within the specified range
    .withColumn(
        "internal_ACCOUNTID",
        LongType(),
        minValue=0x1000000000000,
        uniqueValues=NUMBER_OF_USERS,
        omit=True,
        baseColumnType="hash",
    )  # Internal unique account id, omitted from output, used for deterministic hashing
    .withColumn(
        "ACCOUNTID", StringType(), format="0x%032x", baseColumn="internal_ACCOUNTID"
    )  # Public account id as hex string
    .withColumn(
        "internal_DEVICEID",
        LongType(),
        minValue=0x1000000000000,
        uniqueValues=NUMBER_OF_DEVICES,
        omit=True,
        baseColumnType="hash",
        baseColumn="internal_ACCOUNTID",
    )  # Internal device id, based on account, omitted from output
    .withColumn(
        "DEVICEID", StringType(), format="0x%032x", baseColumn="internal_DEVICEID"
    )  # Public device id as hex string
    .withColumn("APP_VERSION", StringType(), values=["current"])  # Static app version
    .withColumn(
        "AUTHMETHOD", StringType(), values=["OAuth", "password"]
    )  # Auth method, random selection
    # Assign clientName based on DEVICEID deterministically
    .withColumn(
        "CLIENTNAME",
        StringType(),
        expr="""
            element_at(
              array('SwitchGameClient','XboxGameClient','PlaystationGameClient','PCGameClient'),
              (pmod(abs(hash(DEVICEID)), 4) + 1)
            )
        """,
        baseColumn="DEVICEID",  # declare dependency so DEVICEID is generated first
    )
    .withColumn(
        "CLIENTID",
        StringType(),
        expr="sha2(concat(ACCOUNTID, CLIENTNAME), 256)",
        baseColumn=["ACCOUNTID", "CLIENTNAME"],
    )  # Deterministic clientId based on ACCOUNTID and clientName
    .withColumn(
        "SESSION_ID",
        StringType(),
        expr="sha2(concat(ACCOUNTID, CLIENTID), 256)",
        baseColumn=["ACCOUNTID", "CLIENTID"],  # declare dependency for build ordering
    )  # Session correlation id, deterministic hash
    .withColumn(
        "country",
        StringType(),
        values=["USA", "UK", "AUS"],
        weights=[0.6, 0.2, 0.2],
        baseColumn="ACCOUNTID",
        random=True,
    )  # Assign country with 60% USA, 20% UK, 20% AUS
    .withColumn(
        "APPENV", StringType(), values=["prod"]
    )  # Static environment value
    .withColumn(
        "EVENT_TYPE", StringType(), values=["account_login_success"]
    )  # Static event type
    # Assign geoip_city_name based on country and ACCOUNTID
    .withColumn(
        "CITY",
        StringType(),
        expr="""
            CASE
              WHEN country = 'USA' THEN element_at(array('New York', 'San Francisco', 'Chicago'), pmod(abs(hash(ACCOUNTID)), 3) + 1)
              WHEN country = 'UK' THEN 'London'
              WHEN country = 'AUS' THEN 'Sydney'
            END
        """,
        baseColumn=["country", "ACCOUNTID"],
    )
    .withColumn(
        "COUNTRY_CODE2",
        StringType(),
        expr="CASE WHEN country = 'USA' THEN 'US' WHEN country = 'UK' THEN 'UK' WHEN country = 'AUS' THEN 'AU' END",
        baseColumn=["country"],
    )  # Country code
    # Assign ISP based on country and ACCOUNTID
    .withColumn(
        "ISP",
        StringType(),
        expr="""
            CASE
              WHEN country = 'USA' THEN element_at(array('Comcast', 'AT&T', 'Verizon', 'Spectrum', 'Cox'), pmod(abs(hash(ACCOUNTID)), 5) + 1)
              WHEN country = 'UK' THEN element_at(array('BT', 'Sky', 'Virgin Media', 'TalkTalk', 'EE'), pmod(abs(hash(ACCOUNTID)), 5) + 1)
              WHEN country = 'AUS' THEN element_at(array('Telstra', 'Optus', 'TPG', 'Aussie Broadband', 'iiNet'), pmod(abs(hash(ACCOUNTID)), 5) + 1)
              ELSE 'Unknown ISP'
            END
        """,
        baseColumn=["country", "ACCOUNTID"],
    )
    # Assign latitude based on city
    .withColumn(
        "LATITUDE",
        DoubleType(),
        expr="""
            CASE
              WHEN CITY = 'New York' THEN 40.7128
              WHEN CITY = 'San Francisco' THEN 37.7749
              WHEN CITY = 'Chicago' THEN 41.8781
              WHEN CITY = 'London' THEN 51.5074
              WHEN CITY = 'Sydney' THEN -33.8688
              ELSE 0.0
            END
        """,
        baseColumn="CITY",
    )
    # Assign longitude based on city
    .withColumn(
        "LONGITUDE",
        DoubleType(),
        expr="""
            CASE
              WHEN CITY = 'New York' THEN -74.0060
              WHEN CITY = 'San Francisco' THEN -122.4194
              WHEN CITY = 'Chicago' THEN -87.6298
              WHEN CITY = 'London' THEN -0.1278
              WHEN CITY = 'Sydney' THEN 151.2093
              ELSE 0.0
            END
        """,
        baseColumn="CITY",
    )
    # Assign region name based on country and city
    .withColumn(
        "REGION_NAME",
        StringType(),
        expr="""
            CASE
              WHEN country = 'USA' THEN
                CASE
                  WHEN CITY = 'New York' THEN 'New York'
                  WHEN CITY = 'San Francisco' THEN 'California'
                  WHEN CITY = 'Chicago' THEN 'Illinois'
                  ELSE 'Unknown'
                END
              WHEN country = 'UK' THEN 'England'
              WHEN country = 'AUS' THEN 'New South Wales'
              ELSE 'Unknown'
            END
        """,
        baseColumn=["country", "CITY"],
    )
    # Internal IP address as integer, unique per device, omitted from output
    .withColumn(
        "internal_REQUESTIPADDRESS",
        LongType(),
        minValue=0x1000000000000,
        uniqueValues=NUMBER_OF_IPS,
        omit=True,
        baseColumnType="hash",
        baseColumn="internal_DEVICEID",
    )
    # Convert internal IP integer to dotted quad string
    .withColumn(
        "REQUESTIPADDRESS",
        StringType(),
        expr="""
            concat(
              cast((internal_REQUESTIPADDRESS >> 24) & 255 as string), '.',
              cast((internal_REQUESTIPADDRESS >> 16) & 255 as string), '.',
              cast((internal_REQUESTIPADDRESS >> 8) & 255 as string), '.',
              cast(internal_REQUESTIPADDRESS & 255 as string)
            )
        """,
        baseColumn="internal_REQUESTIPADDRESS",
    )
    # Generate user agent string using clientName and SESSION_ID
    .withColumn(
        "USERAGENT",
        StringType(),
        expr="concat('Launch/1.0+', CLIENTNAME, '(', CLIENTNAME, '/)/', SESSION_ID)",
        baseColumn=["CLIENTNAME", "SESSION_ID"],
    )
)
# Build creates a DataFrame from the DataGenerator
default_logins_df = default_annotations_spec.build()
# COMMAND ----------

# DBTITLE 1,Transform the Dataframe
# Derive hour-of-day and calendar date columns from the event timestamp.
logins_df = default_logins_df.withColumn(
    "EVENT_HOUR", hour(col("EVENT_TIMESTAMP"))
).withColumn("EVENT_DATE", to_date(col("EVENT_TIMESTAMP")))
# COMMAND ----------

# DBTITLE 1,Look at the Data
# Preview the generated logins in the notebook UI.
display(logins_df)
# COMMAND ----------

# MAGIC %md
# MAGIC # Write Data

# COMMAND ----------

# BUG FIX: the original referenced `transformed_df`, which is never defined in
# this notebook and would raise NameError at runtime; the transformed DataFrame
# produced above is `logins_df`.
logins_df.write.mode("overwrite").saveAsTable("main.test.EVENT_ACCOUNT_LOGIN_SUCCESS")

0 commit comments

Comments
 (0)