Skip to content

Commit 9193979

Browse files
committed
update starrocks result to 4.0.0-rc01
Signed-off-by: Murphy <mofei@starrocks.com>
1 parent 863bf47 commit 9193979

16 files changed

Lines changed: 158 additions & 24 deletions

starrocks/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*.log
2+
tmp/

starrocks/count.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,7 @@ fi
1212
DB_NAME="$1"
1313
TABLE_NAME="$2"
1414

15-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "SELECT count() FROM $TABLE_NAME;"
15+
# Load shared environment
16+
source "$(dirname "$0")/env.sh"
17+
18+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "SELECT count() FROM $TABLE_NAME;"

starrocks/create_and_load.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
# If you change something in this file, please change also in doris/create_and_load.sh.
44

5+
# Load shared environment
6+
source "$(dirname "$0")/env.sh"
7+
58
# Check if the required arguments are provided
69
if [[ $# -lt 6 ]]; then
710
echo "Usage: $0 <DB_NAME> <TABLE_NAME> <DATA_DIRECTORY> <NUM_FILES> <SUCCESS_LOG> <ERROR_LOG>"
@@ -15,17 +18,18 @@ DATA_DIRECTORY="$3"
1518
NUM_FILES="$4"
1619
SUCCESS_LOG="$5"
1720
ERROR_LOG="$6"
21+
DDL_FILE="ddl.sql"
1822

1923
# Validate arguments
2024
[[ ! -d "$DATA_DIRECTORY" ]] && { echo "Error: Data directory '$DATA_DIRECTORY' does not exist."; exit 1; }
2125
[[ ! "$NUM_FILES" =~ ^[0-9]+$ ]] && { echo "Error: NUM_FILES must be a positive integer."; exit 1; }
2226

2327

2428
echo "Create database"
25-
mysql -P 9030 -h 127.0.0.1 -u root -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
29+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" -e "CREATE DATABASE IF NOT EXISTS $DB_NAME"
2630

2731
echo "Execute DDL"
28-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME < "ddl.sql"
32+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" < "$DDL_FILE"
2933

3034
echo "Load data"
3135
./load_data.sh "$DATA_DIRECTORY" "$DB_NAME" "$TABLE_NAME" "$NUM_FILES" "$SUCCESS_LOG" "$ERROR_LOG"

starrocks/ddl.sql

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
11
CREATE TABLE bluesky (
22
`id` BIGINT AUTO_INCREMENT,
3-
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON"
4-
);
3+
`data` JSON NOT NULL COMMENT "Primary JSON object, optimized for field access using FlatJSON",
4+
5+
sort_key VARBINARY AS encode_sort_key(
6+
get_json_string(data, 'kind'),
7+
get_json_string(data, 'commit.operation'),
8+
get_json_string(data, 'commit.collection'),
9+
get_json_string(data, 'did')
10+
)
11+
)
12+
DISTRIBUTED BY HASH(sort_key) BUCKETS 128
13+
ORDER BY (sort_key);

starrocks/drop_table.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,7 @@ DB_NAME="$1"
1212
TABLE_NAME="$2"
1313

1414
echo "Dropping table: $DB_NAME.$TABLE_NAME"
15-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "DROP TABLE IF EXISTS $TABLE_NAME"
15+
# Load shared environment
16+
source "$(dirname "$0")/env.sh"
17+
18+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "DROP TABLE IF EXISTS $TABLE_NAME"

starrocks/env.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/bash
2+
3+
# Shared configuration for StarRocks connection
4+
# MySQL-compatible query endpoint
5+
export DB_HOST="127.0.0.1"
6+
export DB_MYSQL_PORT="9030"
7+
export DB_USER="root"
8+
9+
# HTTP endpoint for Stream Load
10+
export DB_HTTP_PORT="8030"

starrocks/load_data.sh

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
#!/bin/bash
22

3+
# Load shared environment
4+
source "$(dirname "$0")/env.sh"
5+
36
# Check if the required arguments are provided
47
if [[ $# -lt 6 ]]; then
58
echo "Usage: $0 <DATA_DIRECTORY> <DB_NAME> <TABLE_NAME> <MAX_FILES> <SUCCESS_LOG> <ERROR_LOG>"
@@ -36,14 +39,25 @@ for file in $(ls "$DATA_DIRECTORY"/*.json.gz | head -n "$MAX_FILES"); do
3639
echo "Error: Failed to uncompress $file" >> "$ERROR_LOG"
3740
continue
3841
fi
39-
MAX_ATTEMPT=10
42+
43+
MAX_ATTEMPT=1
4044
attempt=0
4145
while [ $attempt -lt $MAX_ATTEMPT ]
4246
do
43-
# Attempt the import
44-
http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body) --location-trusted -u root: -H "strict_mode: true" -H "Expect:100-continue" -H "columns: data" -T "$uncompressed_file" -XPUT http://127.0.0.1:8030/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
45-
response_body="$(cat /tmp/curl_body)"
46-
response_status="$(cat /tmp/curl_body | jq -r '.Status')"
47+
http_code=$(curl -s -w "%{http_code}" -o >(cat >/tmp/curl_body_$$) \
48+
--location-trusted -u root: \
49+
-H "max_filter_ratio: 0.00001" \
50+
-H "strict_mode: true" \
51+
-H "Expect:100-continue" \
52+
-T "$uncompressed_file" \
53+
-XPUT http://${DB_HOST}:${DB_HTTP_PORT}/api/"$DB_NAME"/"$TABLE_NAME"/_stream_load)
54+
response_body="$(cat /tmp/curl_body_$$)"
55+
if jq -e . >/dev/null 2>&1 < /tmp/curl_body_$$; then
56+
response_status="$(jq -r '.Status' < /tmp/curl_body_$$)"
57+
else
58+
response_status=""
59+
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Invalid JSON response for $file: $(cat /tmp/curl_body_$$)" >> "$ERROR_LOG"
60+
fi
4761
echo $response_status
4862
if [[ "$http_code" -ge 200 && "$http_code" -lt 300 ]]; then
4963
if [ "$response_status" = "Success" ]

starrocks/physical_query_plans.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ fi
99
# Arguments
1010
DB_NAME="$1"
1111

12+
# Load shared environment
13+
source "$(dirname "$0")/env.sh"
14+
1215
QUERY_NUM=1
1316

1417
cat queries.sql | while read -r query; do
@@ -17,7 +20,7 @@ cat queries.sql | while read -r query; do
1720
echo "------------------------------------------------------------------------------------------------------------------------"
1821
echo "Physical query plan for query Q$QUERY_NUM:"
1922
echo
20-
mysql -P 9030 -h 127.0.0.1 -u root $DB_NAME -e "EXPLAIN $query"
23+
mysql -P "$DB_MYSQL_PORT" -h "$DB_HOST" -u "$DB_USER" "$DB_NAME" -e "EXPLAIN $query"
2124

2225
# Increment the query number
2326
QUERY_NUM=$((QUERY_NUM + 1))

starrocks/queries.sql

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC;
22
SELECT get_json_string(data, 'commit.collection') AS event, count() AS count, count(DISTINCT get_json_string(data, 'did')) AS users FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') GROUP BY event ORDER BY count DESC;
3-
SELECT get_json_string(data, 'commit.collection') AS event, hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4-
SELECT get_json_string(data, '$.did') as user_id, min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
5-
SELECT get_json_string(data, '$.did') as user_id, date_diff('millisecond', min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))), max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;
3+
SELECT get_json_string(data, 'commit.collection') AS event, hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day, count() AS count FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (array_contains(['app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like'], get_json_string(data, 'commit.collection'))) GROUP BY event, hour_of_day ORDER BY hour_of_day, event;
4+
SELECT get_json_string(data, 'did') as user_id, to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY first_post_date ASC LIMIT 3;
5+
SELECT get_json_string(data, 'did') as user_id, date_diff('millisecond', to_datetime(min(get_json_int(data, 'time_us')), 6), to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span FROM bluesky WHERE (get_json_string(data, 'kind') = 'commit') AND (get_json_string(data, 'commit.operation') = 'create') AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post') GROUP BY user_id ORDER BY activity_span DESC LIMIT 3;

starrocks/queries_formatted.sql

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ ORDER BY count DESC;
2626
------------------------------------------------------------------------------------------------------------------------
2727
SELECT
2828
get_json_string(data, 'commit.collection') AS event,
29-
hour(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) as hour_of_day,
29+
hour_from_unixtime(get_json_int(data, 'time_us')/1000000) as hour_of_day,
3030
count() AS count
3131
FROM bluesky
3232
WHERE (get_json_string(data, 'kind') = 'commit')
@@ -39,8 +39,8 @@ ORDER BY hour_of_day, event;
3939
-- Q4 - top 3 post veterans
4040
------------------------------------------------------------------------------------------------------------------------
4141
SELECT
42-
get_json_string(data, '$.did') as user_id,
43-
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))) AS first_post_date
42+
get_json_string(data, 'did') as user_id,
43+
to_datetime(min(get_json_int(data, 'time_us')), 6) AS first_post_date
4444
FROM bluesky
4545
WHERE (get_json_string(data, 'kind') = 'commit')
4646
AND (get_json_string(data, 'commit.operation') = 'create')
@@ -53,11 +53,11 @@ LIMIT 3;
5353
-- Q5 - top 3 users with longest activity
5454
------------------------------------------------------------------------------------------------------------------------
5555
SELECT
56-
get_json_string(data, '$.did') as user_id,
56+
get_json_string(data, 'did') as user_id,
5757
date_diff('millisecond',
58-
min(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000)))),
59-
max(from_unixtime(round(divide(get_json_int(data, 'time_us'), 1000000))))) AS activity_span
60-
FROM bluesky
58+
to_datetime(min(get_json_int(data, 'time_us')), 6),
59+
to_datetime(max(get_json_int(data, 'time_us')), 6)) AS activity_span
60+
FROM bluesky
6161
WHERE (get_json_string(data, 'kind') = 'commit')
6262
AND (get_json_string(data, 'commit.operation') = 'create')
6363
AND (get_json_string(data, 'commit.collection') = 'app.bsky.feed.post')

0 commit comments

Comments
 (0)