解析 Markdown、CSV、JSON 等格式的大文件时,Rust 比 JavaScript 快不少。
文件解析的特点:
实测数据:
| 格式 | 文件大小 | JavaScript | Rust + WASM | 提升 |
|---|---|---|---|---|
| Markdown | 5MB | 920ms | 145ms | 6.3x |
| CSV | 10MB | 1850ms | 280ms | 6.6x |
| JSON | 5MB | 340ms | 95ms | 3.6x |
1[package]
2name = "file-parser"
3version = "0.1.0"
4edition = "2021"
5
6[lib]
7crate-type = ["cdylib"]
8
9[dependencies]
10wasm-bindgen = "0.2"
11serde = { version = "1.0", features = ["derive"] }
12serde-wasm-bindgen = "0.6"
13serde_json = "1.0"
14pulldown-cmark = { version = "0.9", default-features = false }
15
16[profile.release]
17opt-level = 3
18lto = true
1use wasm_bindgen::prelude::*;
2use pulldown_cmark::{Parser, Options, html};
3
4#[wasm_bindgen]
5pub fn markdown_to_html(markdown: &str) -> String {
6 let mut options = Options::empty();
7 options.insert(Options::ENABLE_TABLES);
8 options.insert(Options::ENABLE_FOOTNOTES);
9 options.insert(Options::ENABLE_STRIKETHROUGH);
10 options.insert(Options::ENABLE_TASKLISTS);
11
12 let parser = Parser::new_ext(markdown, options);
13
14 let mut html_output = String::new();
15 html::push_html(&mut html_output, parser);
16
17 html_output
18}
19
20// 提取标题
21#[wasm_bindgen]
22pub struct Heading {
23 level: u32,
24 text: String,
25}
26
27#[wasm_bindgen]
28impl Heading {
29 #[wasm_bindgen(getter)]
30 pub fn level(&self) -> u32 { self.level }
31
32 #[wasm_bindgen(getter)]
33 pub fn text(&self) -> String { self.text.clone() }
34}
35
36#[wasm_bindgen]
37pub fn extract_headings(markdown: &str) -> Vec<Heading> {
38 use pulldown_cmark::{Event, Tag, HeadingLevel};
39
40 let parser = Parser::new(markdown);
41 let mut headings = Vec::new();
42 let mut in_heading = false;
43 let mut current_level = 1;
44 let mut current_text = String::new();
45
46 for event in parser {
47 match event {
48 Event::Start(Tag::Heading(level, ..)) => {
49 in_heading = true;
50 current_level = match level {
51 HeadingLevel::H1 => 1,
52 HeadingLevel::H2 => 2,
53 HeadingLevel::H3 => 3,
54 HeadingLevel::H4 => 4,
55 HeadingLevel::H5 => 5,
56 HeadingLevel::H6 => 6,
57 };
58 current_text.clear();
59 },
60 Event::End(Tag::Heading(..)) => {
61 if in_heading {
62 headings.push(Heading {
63 level: current_level,
64 text: current_text.clone(),
65 });
66 in_heading = false;
67 }
68 },
69 Event::Text(text) if in_heading => {
70 current_text.push_str(&text);
71 },
72 _ => {}
73 }
74 }
75
76 headings
77}
78
79// 字数统计
80#[wasm_bindgen]
81pub struct ReadingStats {
82 word_count: usize,
83 char_count: usize,
84 reading_time_minutes: usize,
85}
86
87#[wasm_bindgen]
88impl ReadingStats {
89 #[wasm_bindgen(getter)]
90 pub fn word_count(&self) -> usize { self.word_count }
91
92 #[wasm_bindgen(getter)]
93 pub fn char_count(&self) -> usize { self.char_count }
94
95 #[wasm_bindgen(getter)]
96 pub fn reading_time_minutes(&self) -> usize { self.reading_time_minutes }
97}
98
99#[wasm_bindgen]
100pub fn calculate_reading_stats(markdown: &str) -> ReadingStats {
101 let parser = Parser::new(markdown);
102 let mut word_count = 0;
103 let mut char_count = 0;
104
105 for event in parser {
106 if let Event::Text(text) = event {
107 for c in text.chars() {
108 char_count += 1;
109 if c.is_whitespace() {
110 word_count += 1;
111 } else if is_cjk(c) {
112 word_count += 1;
113 }
114 }
115 }
116 }
117
118 let reading_time_minutes = (word_count / 200).max(1);
119
120 ReadingStats { word_count, char_count, reading_time_minutes }
121}
122
123fn is_cjk(c: char) -> bool {
124 matches!(c,
125 '\u{4E00}'..='\u{9FFF}' |
126 '\u{3400}'..='\u{4DBF}' |
127 '\u{20000}'..='\u{2A6DF}'
128 )
129}1#[wasm_bindgen]
2pub struct CsvData {
3 headers: Vec<String>,
4 rows: Vec<Vec<String>>,
5}
6
7#[wasm_bindgen]
8impl CsvData {
9 pub fn get_headers(&self) -> JsValue {
10 serde_wasm_bindgen::to_value(&self.headers).unwrap()
11 }
12
13 pub fn get_rows(&self) -> JsValue {
14 serde_wasm_bindgen::to_value(&self.rows).unwrap()
15 }
16
17 pub fn row_count(&self) -> usize {
18 self.rows.len()
19 }
20}
21
22#[wasm_bindgen]
23pub fn parse_csv(csv_text: &str, delimiter: char) -> Result<CsvData, JsValue> {
24 let mut headers = Vec::new();
25 let mut rows = Vec::new();
26 let mut is_first_row = true;
27
28 for line in csv_text.lines() {
29 if line.trim().is_empty() {
30 continue;
31 }
32
33 let fields = parse_csv_line(line, delimiter);
34
35 if is_first_row {
36 headers = fields;
37 is_first_row = false;
38 } else {
39 if fields.len() != headers.len() {
40 return Err(JsValue::from_str(&format!(
41 "字段数量不对: 应该 {}, 实际 {}",
42 headers.len(),
43 fields.len()
44 )));
45 }
46 rows.push(fields);
47 }
48 }
49
50 Ok(CsvData { headers, rows })
51}
52
53fn parse_csv_line(line: &str, delimiter: char) -> Vec<String> {
54 let mut fields = Vec::new();
55 let mut current_field = String::new();
56 let mut in_quotes = false;
57 let mut chars = line.chars().peekable();
58
59 while let Some(c) = chars.next() {
60 match c {
61 '"' => {
62 if in_quotes && chars.peek() == Some(&'"') {
63 current_field.push('"');
64 chars.next();
65 } else {
66 in_quotes = !in_quotes;
67 }
68 },
69 c if c == delimiter && !in_quotes => {
70 fields.push(current_field.trim().to_string());
71 current_field.clear();
72 },
73 _ => {
74 current_field.push(c);
75 }
76 }
77 }
78
79 fields.push(current_field.trim().to_string());
80 fields
81}1use serde_json::Value;
2
3#[wasm_bindgen]
4pub fn format_json(json_str: &str, indent: usize) -> Result<String, JsValue> {
5 let value: Value = serde_json::from_str(json_str)
6 .map_err(|e| JsValue::from_str(&e.to_string()))?;
7
8 let formatter = serde_json::ser::PrettyFormatter::with_indent(&vec![b' '; indent]);
9 let mut buf = Vec::new();
10 let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter);
11
12 value.serialize(&mut ser)
13 .map_err(|e| JsValue::from_str(&e.to_string()))?;
14
15 String::from_utf8(buf)
16 .map_err(|e| JsValue::from_str(&e.to_string()))
17}
18
/// Re-serializes `json_str` in compact form.
///
/// # Errors
///
/// Returns the serde_json error message as a `JsValue` string when parsing
/// or serialization fails.
#[wasm_bindgen]
pub fn minify_json(json_str: &str) -> Result<String, JsValue> {
    let into_js_error = |err: serde_json::Error| JsValue::from_str(&err.to_string());

    let parsed = serde_json::from_str::<Value>(json_str).map_err(into_js_error)?;
    serde_json::to_string(&parsed).map_err(into_js_error)
}

import init, {
2 markdown_to_html,
3 extract_headings,
4 calculate_reading_stats,
5} from "./pkg/file_parser.js";
6
// Wait for the WASM module's init() before anything else runs.
await init();

// Page elements used by the live preview.
const [editor, preview, toc] = ["editor", "preview", "toc"].map((id) =>
  document.getElementById(id)
);
12
// Debounce
/**
 * Wraps `func` so that it runs only after `wait` milliseconds have passed
 * without another call. The wrapped function is invoked with the `this`
 * value and arguments of the most recent call.
 */
function debounce(func, wait) {
  let timeout;
  return function (...args) {
    const context = this;
    clearTimeout(timeout);
    timeout = setTimeout(function () {
      func.apply(context, args);
    }, wait);
  };
}
21
// Re-render the preview, table of contents and statistics 300 ms after the
// user stops typing.
editor.addEventListener(
  "input",
  debounce(() => {
    const markdown = editor.value;

    // Convert to HTML.
    // NOTE(review): the rendered HTML is inserted as-is; sanitize it first if
    // the Markdown can come from an untrusted source.
    const html = markdown_to_html(markdown);
    preview.innerHTML = html;

    // Build the table of contents from DOM nodes so heading text is always
    // treated as text, never parsed as HTML.
    const headings = extract_headings(markdown);
    const entries = document.createDocumentFragment();
    for (const h of headings) {
      const entry = document.createElement("div");
      entry.style.marginLeft = `${(h.level - 1) * 20}px`;
      entry.textContent = h.text;
      entries.append(entry);
      // Each heading is a WASM-backed object; release it once it has been read.
      h.free();
    }
    toc.replaceChildren(entries);

    // Statistics
    const stats = calculate_reading_stats(markdown);
    try {
      document.getElementById("stats").innerHTML = `
 字数: ${stats.word_count} |
 字符: ${stats.char_count} |
 阅读时间: ${stats.reading_time_minutes} 分钟
 `;
    } finally {
      stats.free();
    }
  }, 300)
);

import { parse_csv } from "./pkg/file_parser.js";
2
/**
 * Reads `file` as text, parses it as comma-separated CSV, logs the parse
 * time and row count, and renders the result with `renderTable`.
 *
 * Errors thrown by `parse_csv` propagate to the caller as a rejected promise.
 */
async function handleCsvFile(file) {
  const text = await file.text();

  const start = performance.now();
  const csvData = parse_csv(text, ",");
  const duration = performance.now() - start;

  try {
    console.log(`解析完成 (${duration.toFixed(2)} ms)`);
    console.log(`行数: ${csvData.row_count()}`);

    const headers = csvData.get_headers();
    const rows = csvData.get_rows();

    renderTable(headers, rows);
  } finally {
    // The parsed table lives in WASM memory; release it once the plain JS
    // copies have been taken, even if rendering throws.
    csvData.free();
  }
}
18
/**
 * Replaces the contents of #table-container with a table built from
 * `headers` and at most the first 100 entries of `rows`.
 * Cell values are assigned through `textContent`.
 */
function renderTable(headers, rows) {
  const table = document.createElement("table");

  // Header row
  const headerRow = table.createTHead().insertRow();
  for (const label of headers) {
    const th = document.createElement("th");
    th.textContent = label;
    headerRow.appendChild(th);
  }

  // Data rows
  const tbody = table.createTBody();
  for (const row of rows.slice(0, 100)) {
    const tr = tbody.insertRow();
    for (const cell of row) {
      tr.insertCell().textContent = cell;
    }
  }

  const container = document.getElementById("table-container");
  container.innerHTML = "";
  container.appendChild(table);
}

import { format_json, minify_json } from "./pkg/file_parser.js";
2
// Format: pretty-print the input with a 2-space indent.
document.getElementById("format").addEventListener("click", function onFormatClick() {
  const input = document.getElementById("json-input").value;

  try {
    const formatted = format_json(input, 2);
    const output = document.getElementById("json-output");
    output.value = formatted;
  } catch (err) {
    alert("JSON 格式错误: " + err);
  }
});
14
15// 压缩
16document.getElementById("minify").addEventListener("click", () => {
17 const input = document.getElementById("json-input").value;
18
19 try {
20 const minified = minify_json(input);
21 document.getElementById("json-output").value = minified;
22
23 const saved = (1 - minified.length / input.length) * 100;
24 alert(`节省了 ${saved.toFixed(1)}% 空间`);
25 } catch (e) {
26 alert("JSON 格式错误: " + e);
27 }
28});分块读取:
1async function parseHugeFile(file) {
2 const CHUNK_SIZE = 1024 * 1024; // 1MB
3 let offset = 0;
4
5 while (offset < file.size) {
6 const chunk = file.slice(offset, offset + CHUNK_SIZE);
7 const text = await chunk.text();
8
9 processChunk(text);
10
11 offset += CHUNK_SIZE;
12 await new Promise((r) => setTimeout(r, 0));
13 }
14}避免阻塞主线程:
1const worker = new Worker("parser-worker.js");
2
3worker.postMessage({ type: "parse_csv", data: csvText });
4
5worker.onmessage = (e) => {
6 const result = e.data;
7 renderTable(result);
8};使用成熟的库:
// pulldown-cmark is much faster than a hand-written parser
use pulldown_cmark::{Parser, html};

/// Converts Markdown to HTML with a parser created by `Parser::new`
/// (no extension options are passed).
pub fn markdown_to_html(md: &str) -> String {
    let events = Parser::new(md);
    let mut out = String::new();
    html::push_html(&mut out, events);
    out
}

class MarkdownEditor {
2 constructor(textarea, preview) {
3 this.textarea = textarea;
4 this.preview = preview;
5
6 this.textarea.addEventListener(
7 "input",
8 debounce(() => this.render(), 300)
9 );
10 }
11
12 render() {
13 const markdown = this.textarea.value;
14 const html = markdown_to_html(markdown);
15 this.preview.innerHTML = html;
16
17 // 同步滚动
18 this.syncScroll();
19 }
20
21 syncScroll() {
22 const scrollPercent =
23 this.textarea.scrollTop /
24 (this.textarea.scrollHeight - this.textarea.clientHeight);
25
26 this.preview.scrollTop =
27 scrollPercent * (this.preview.scrollHeight - this.preview.clientHeight);
28 }
29}1async function analyzeCSV(file) {
2 const text = await file.text();
3 const data = parse_csv(text, ",");
4
5 const headers = data.get_headers();
6 const rows = data.get_rows();
7
8 // 找数值列
9 const numericColumns = headers.filter((h, i) => {
10 return rows.every((row) => !isNaN(parseFloat(row[i])));
11 });
12
13 console.log("数值列:", numericColumns);
14
15 // 可以进一步做统计分析
16 for (const col of numericColumns) {
17 const colIndex = headers.indexOf(col);
18 const values = rows.map((row) => parseFloat(row[colIndex]));
19
20 const stats = calculate_statistics(new Float64Array(values));
21 console.log(`${col}: 均值 ${stats.mean.toFixed(2)}`);
22 }
23}适合:
不适合:
JSON.parse)