文件解析

解析 Markdown、CSV、JSON 等格式的大文件时,Rust 比 JavaScript 快不少。

为什么需要 WASM

文件解析的特点:

  • 大量字符串操作
  • 状态机逻辑
  • 需要遍历整个文件
  • JavaScript 字符串处理相对较慢

性能对比

实测数据:

| 格式 | 文件大小 | JavaScript | Rust + WASM | 提升 |
| --- | --- | --- | --- | --- |
| Markdown | 5MB | 920ms | 145ms | 6.3x |
| CSV | 10MB | 1850ms | 280ms | 6.6x |
| JSON | 5MB | 340ms | 95ms | 3.6x |

Rust 实现

Cargo.toml

# Crate metadata.
[package]
name = "file-parser"
version = "0.1.0"
edition = "2021"

# Build a C-style dynamic library; this is the crate type used for the WASM output.
[lib]
crate-type = ["cdylib"]

# Dependencies, sorted alphabetically.
[dependencies]
pulldown-cmark = { version = "0.9", default-features = false }
serde = { version = "1.0", features = ["derive"] }
serde-wasm-bindgen = "0.6"
serde_json = "1.0"
wasm-bindgen = "0.2"

# Release builds: maximum optimization level plus link-time optimization.
[profile.release]
lto = true
opt-level = 3

Markdown 解析

1use wasm_bindgen::prelude::*;
2use pulldown_cmark::{Parser, Options, html};
3
4#[wasm_bindgen]
5pub fn markdown_to_html(markdown: &str) -> String {
6    let mut options = Options::empty();
7    options.insert(Options::ENABLE_TABLES);
8    options.insert(Options::ENABLE_FOOTNOTES);
9    options.insert(Options::ENABLE_STRIKETHROUGH);
10    options.insert(Options::ENABLE_TASKLISTS);
11
12    let parser = Parser::new_ext(markdown, options);
13
14    let mut html_output = String::new();
15    html::push_html(&mut html_output, parser);
16
17    html_output
18}
19
20// 提取标题
21#[wasm_bindgen]
22pub struct Heading {
23    level: u32,
24    text: String,
25}
26
27#[wasm_bindgen]
28impl Heading {
29    #[wasm_bindgen(getter)]
30    pub fn level(&self) -> u32 { self.level }
31
32    #[wasm_bindgen(getter)]
33    pub fn text(&self) -> String { self.text.clone() }
34}
35
36#[wasm_bindgen]
37pub fn extract_headings(markdown: &str) -> Vec<Heading> {
38    use pulldown_cmark::{Event, Tag, HeadingLevel};
39
40    let parser = Parser::new(markdown);
41    let mut headings = Vec::new();
42    let mut in_heading = false;
43    let mut current_level = 1;
44    let mut current_text = String::new();
45
46    for event in parser {
47        match event {
48            Event::Start(Tag::Heading(level, ..)) => {
49                in_heading = true;
50                current_level = match level {
51                    HeadingLevel::H1 => 1,
52                    HeadingLevel::H2 => 2,
53                    HeadingLevel::H3 => 3,
54                    HeadingLevel::H4 => 4,
55                    HeadingLevel::H5 => 5,
56                    HeadingLevel::H6 => 6,
57                };
58                current_text.clear();
59            },
60            Event::End(Tag::Heading(..)) => {
61                if in_heading {
62                    headings.push(Heading {
63                        level: current_level,
64                        text: current_text.clone(),
65                    });
66                    in_heading = false;
67                }
68            },
69            Event::Text(text) if in_heading => {
70                current_text.push_str(&text);
71            },
72            _ => {}
73        }
74    }
75
76    headings
77}
78
79// 字数统计
80#[wasm_bindgen]
81pub struct ReadingStats {
82    word_count: usize,
83    char_count: usize,
84    reading_time_minutes: usize,
85}
86
87#[wasm_bindgen]
88impl ReadingStats {
89    #[wasm_bindgen(getter)]
90    pub fn word_count(&self) -> usize { self.word_count }
91
92    #[wasm_bindgen(getter)]
93    pub fn char_count(&self) -> usize { self.char_count }
94
95    #[wasm_bindgen(getter)]
96    pub fn reading_time_minutes(&self) -> usize { self.reading_time_minutes }
97}
98
99#[wasm_bindgen]
100pub fn calculate_reading_stats(markdown: &str) -> ReadingStats {
101    let parser = Parser::new(markdown);
102    let mut word_count = 0;
103    let mut char_count = 0;
104
105    for event in parser {
106        if let Event::Text(text) = event {
107            for c in text.chars() {
108                char_count += 1;
109                if c.is_whitespace() {
110                    word_count += 1;
111                } else if is_cjk(c) {
112                    word_count += 1;
113                }
114            }
115        }
116    }
117
118    let reading_time_minutes = (word_count / 200).max(1);
119
120    ReadingStats { word_count, char_count, reading_time_minutes }
121}
122
/// Returns `true` when `c` lies in one of three CJK ideograph ranges:
/// U+4E00..=U+9FFF, U+3400..=U+4DBF or U+20000..=U+2A6DF.
fn is_cjk(c: char) -> bool {
    const RANGES: [(u32, u32); 3] = [
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
    ];

    let code = u32::from(c);
    RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code))
}

CSV 解析

1#[wasm_bindgen]
2pub struct CsvData {
3    headers: Vec<String>,
4    rows: Vec<Vec<String>>,
5}
6
7#[wasm_bindgen]
8impl CsvData {
9    pub fn get_headers(&self) -> JsValue {
10        serde_wasm_bindgen::to_value(&self.headers).unwrap()
11    }
12
13    pub fn get_rows(&self) -> JsValue {
14        serde_wasm_bindgen::to_value(&self.rows).unwrap()
15    }
16
17    pub fn row_count(&self) -> usize {
18        self.rows.len()
19    }
20}
21
22#[wasm_bindgen]
23pub fn parse_csv(csv_text: &str, delimiter: char) -> Result<CsvData, JsValue> {
24    let mut headers = Vec::new();
25    let mut rows = Vec::new();
26    let mut is_first_row = true;
27
28    for line in csv_text.lines() {
29        if line.trim().is_empty() {
30            continue;
31        }
32
33        let fields = parse_csv_line(line, delimiter);
34
35        if is_first_row {
36            headers = fields;
37            is_first_row = false;
38        } else {
39            if fields.len() != headers.len() {
40                return Err(JsValue::from_str(&format!(
41                    "字段数量不对: 应该 {}, 实际 {}",
42                    headers.len(),
43                    fields.len()
44                )));
45            }
46            rows.push(fields);
47        }
48    }
49
50    Ok(CsvData { headers, rows })
51}
52
/// Splits one CSV record into its fields.
///
/// - `delimiter` separates fields unless it appears inside double quotes.
/// - Inside quotes, `""` stands for a literal double quote.
/// - Whitespace around an unquoted value is trimmed, but whitespace that was
///   inside quotes is preserved (`a, " b " ,c` yields `a`, ` b `, `c`).
///
/// Always returns at least one field; an empty record yields one empty string.
fn parse_csv_line(line: &str, delimiter: char) -> Vec<String> {
    let mut fields = Vec::new();
    let mut current_field = String::new();
    // Byte length of `current_field` up to the end of the last quoted section;
    // trailing-whitespace trimming must never cut below this point.
    let mut protected_len = 0;
    let mut in_quotes = false;
    let mut chars = line.chars().peekable();

    // `while let` rather than `for`: the escaped-quote case consumes a second character.
    while let Some(c) = chars.next() {
        if in_quotes {
            if c != '"' {
                current_field.push(c);
            } else if chars.peek() == Some(&'"') {
                current_field.push('"');
                chars.next();
            } else {
                in_quotes = false;
            }
            protected_len = current_field.len();
        } else if c == '"' {
            in_quotes = true;
        } else if c == delimiter {
            fields.push(finish_csv_field(&mut current_field, protected_len));
            protected_len = 0;
        } else if !(c.is_whitespace() && current_field.is_empty()) {
            // Unquoted leading whitespace is dropped; everything else is kept.
            current_field.push(c);
        }
    }

    fields.push(finish_csv_field(&mut current_field, protected_len));
    fields
}

/// Trims unquoted trailing whitespace from `field` and moves the result out,
/// leaving `field` empty for the next value.
fn finish_csv_field(field: &mut String, protected_len: usize) -> String {
    let keep = field.trim_end().len().max(protected_len);
    field.truncate(keep);
    std::mem::take(field)
}

JSON 操作

1use serde_json::Value;
2
3#[wasm_bindgen]
4pub fn format_json(json_str: &str, indent: usize) -> Result<String, JsValue> {
5    let value: Value = serde_json::from_str(json_str)
6        .map_err(|e| JsValue::from_str(&e.to_string()))?;
7
8    let formatter = serde_json::ser::PrettyFormatter::with_indent(&vec![b' '; indent]);
9    let mut buf = Vec::new();
10    let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter);
11
12    value.serialize(&mut ser)
13        .map_err(|e| JsValue::from_str(&e.to_string()))?;
14
15    String::from_utf8(buf)
16        .map_err(|e| JsValue::from_str(&e.to_string()))
17}
18
19#[wasm_bindgen]
20pub fn minify_json(json_str: &str) -> Result<String, JsValue> {
21    let value: Value = serde_json::from_str(json_str)
22        .map_err(|e| JsValue::from_str(&e.to_string()))?;
23
24    serde_json::to_string(&value)
25        .map_err(|e| JsValue::from_str(&e.to_string()))
26}

JavaScript 集成

Markdown 实时预览

1import init, {
2  markdown_to_html,
3  extract_headings,
4  calculate_reading_stats,
5} from "./pkg/file_parser.js";
6
// Initialize the WASM module before any of its exports is called.
await init();

// DOM nodes used by the live preview: source editor, rendered preview, table of contents.
const [editor, preview, toc] = ["editor", "preview", "toc"].map((id) =>
  document.getElementById(id)
);
12
/**
 * Debounce: returns a wrapper that delays calling `func` until `wait`
 * milliseconds have passed since the wrapper was last invoked. The eventual
 * call receives the `this` value and arguments of the most recent invocation.
 */
function debounce(func, wait) {
  let pending;
  return function (...args) {
    clearTimeout(pending);
    pending = setTimeout(() => func.apply(this, args), wait);
  };
}
21
// Re-render the preview, table of contents and statistics 300 ms after the
// user stops typing.
editor.addEventListener(
  "input",
  debounce(() => {
    const markdown = editor.value;

    // Render to HTML.
    // NOTE(review): the generated HTML is inserted as-is. If the Markdown can
    // come from an untrusted source, sanitize it before assigning to innerHTML.
    const html = markdown_to_html(markdown);
    preview.innerHTML = html;

    // Build the table of contents from DOM nodes, so heading text is treated
    // as text and cannot inject markup.
    const headings = extract_headings(markdown);
    toc.replaceChildren(
      ...headings.map((h) => {
        const item = document.createElement("div");
        item.style.marginLeft = `${(h.level - 1) * 20}px`;
        item.textContent = h.text;
        // Objects returned from WASM own memory on the WASM heap; release it
        // as soon as the values have been copied out.
        h.free();
        return item;
      })
    );

    // Statistics (numeric values only, so interpolation is safe here).
    const stats = calculate_reading_stats(markdown);
    document.getElementById("stats").innerHTML = `
        字数: ${stats.word_count} | 
        字符: ${stats.char_count} | 
        阅读时间: ${stats.reading_time_minutes} 分钟
    `;
    stats.free();
  }, 300)
);

CSV 数据导入

1import { parse_csv } from "./pkg/file_parser.js";
2
/**
 * Reads a CSV file, parses it with the WASM parser, logs the parse time and
 * row count, and renders the result as a table.
 *
 * Rejects with the parser's error when the CSV is malformed.
 */
async function handleCsvFile(file) {
  const text = await file.text();

  const start = performance.now();
  const csvData = parse_csv(text, ",");
  const duration = performance.now() - start;

  try {
    console.log(`解析完成 (${duration.toFixed(2)} ms)`);
    console.log(`行数: ${csvData.row_count()}`);

    // get_headers/get_rows return plain JS values, so they stay usable after
    // the WASM object is released.
    const headers = csvData.get_headers();
    const rows = csvData.get_rows();

    renderTable(headers, rows);
  } finally {
    // The parsed data lives on the WASM heap until it is explicitly freed.
    csvData.free();
  }
}
18
/**
 * Renders `headers` and the first 100 entries of `rows` as an HTML table,
 * replacing the current content of the #table-container element.
 * Cell values are assigned via textContent, so they are never parsed as HTML.
 */
function renderTable(headers, rows) {
  const table = document.createElement("table");

  // Header row
  const headerRow = table.createTHead().insertRow();
  for (const title of headers) {
    const th = document.createElement("th");
    th.textContent = title;
    headerRow.appendChild(th);
  }

  // Data rows (preview is capped at 100)
  const tbody = table.createTBody();
  for (const row of rows.slice(0, 100)) {
    const tr = tbody.insertRow();
    for (const cell of row) {
      tr.insertCell().textContent = cell;
    }
  }

  const container = document.getElementById("table-container");
  container.innerHTML = "";
  container.appendChild(table);
}

JSON 工具

1import { format_json, minify_json } from "./pkg/file_parser.js";
2
// Pretty-print: re-indent the JSON in #json-input with two spaces and show it
// in #json-output; any failure is reported through an alert.
document
  .getElementById("format")
  .addEventListener("click", function onFormatClick() {
    const source = document.getElementById("json-input").value;

    try {
      const pretty = format_json(source, 2);
      const output = document.getElementById("json-output");
      output.value = pretty;
    } catch (e) {
      alert("JSON 格式错误: " + e);
    }
  });
14
// Minify: strip insignificant whitespace from the JSON in #json-input, show
// the result in #json-output and report the size reduction as a percentage.
document
  .getElementById("minify")
  .addEventListener("click", function onMinifyClick() {
    const source = document.getElementById("json-input").value;

    try {
      const compact = minify_json(source);
      const output = document.getElementById("json-output");
      output.value = compact;

      // Ratio of string lengths before and after minification.
      const ratio = compact.length / source.length;
      const saved = (1 - ratio) * 100;
      alert(`节省了 ${saved.toFixed(1)}% 空间`);
    } catch (e) {
      alert("JSON 格式错误: " + e);
    }
  });

注意事项

大文件处理

分块读取:

/**
 * Reads a large file in 1 MB slices and passes the decoded text of each slice
 * to `processChunk`, yielding to the event loop between slices.
 *
 * Decoding uses one streaming TextDecoder for the whole file, so a multi-byte
 * UTF-8 character that straddles a slice boundary is carried over to the next
 * slice instead of being corrupted.
 *
 * NOTE(review): slice boundaries still fall at arbitrary positions in the
 * text, so `processChunk` is assumed to cope with a line or record that is
 * split across two calls — confirm against its implementation.
 */
async function parseHugeFile(file) {
  const CHUNK_SIZE = 1024 * 1024; // 1MB
  const decoder = new TextDecoder("utf-8");
  let offset = 0;

  while (offset < file.size) {
    const chunk = file.slice(offset, offset + CHUNK_SIZE);
    const bytes = await chunk.arrayBuffer();
    offset += CHUNK_SIZE;

    // `stream: true` keeps an incomplete trailing byte sequence buffered for
    // the next call; the final slice flushes the decoder.
    const text = decoder.decode(bytes, { stream: offset < file.size });

    processChunk(text);

    // Yield so the UI stays responsive between slices.
    await new Promise((r) => setTimeout(r, 0));
  }
}

Worker 线程

避免阻塞主线程:

// Parse in a worker so the main thread is not blocked.
const worker = new Worker("parser-worker.js");

worker.postMessage({ type: "parse_csv", data: csvText });

worker.onmessage = (e) => {
  // renderTable takes the header row and the data rows as separate arguments.
  // NOTE(review): assumes parser-worker.js replies with `{ headers, rows }` —
  // confirm against the worker's implementation.
  const { headers, rows } = e.data;
  renderTable(headers, rows);
};

性能优化

使用成熟的库:

1// pulldown-cmark 比手写快很多
2use pulldown_cmark::{Parser, html};
3
4pub fn markdown_to_html(md: &str) -> String {
5    let mut html = String::new();
6    html::push_html(&mut html, Parser::new(md));
7    html
8}

实际应用

Markdown 编辑器

/**
 * Live Markdown editor: re-renders `preview` from `textarea` 300 ms after the
 * user stops typing and keeps the preview's scroll position proportional to
 * the textarea's.
 */
class MarkdownEditor {
  constructor(textarea, preview) {
    this.textarea = textarea;
    this.preview = preview;

    this.textarea.addEventListener(
      "input",
      debounce(() => this.render(), 300)
    );
  }

  /** Converts the current textarea content to HTML and shows it in the preview. */
  render() {
    const markdown = this.textarea.value;
    // NOTE(review): the generated HTML is inserted as-is. If the Markdown can
    // come from an untrusted source, sanitize it before assigning to innerHTML.
    const html = markdown_to_html(markdown);
    this.preview.innerHTML = html;

    // Keep both panes aligned.
    this.syncScroll();
  }

  /** Scrolls the preview to the same relative position as the textarea. */
  syncScroll() {
    const scrollable = this.textarea.scrollHeight - this.textarea.clientHeight;

    // When the textarea has nothing to scroll, the ratio would be 0/0 (NaN);
    // treat that case as "at the top".
    const scrollPercent =
      scrollable > 0 ? this.textarea.scrollTop / scrollable : 0;

    this.preview.scrollTop =
      scrollPercent * (this.preview.scrollHeight - this.preview.clientHeight);
  }
}

CSV 数据分析

/**
 * Parses a CSV file, finds the columns whose every cell is a finite number,
 * and logs the mean of each such column.
 *
 * Columns are tracked by index, so duplicate header names are handled
 * correctly. A cell counts as numeric only when the whole cell is a number
 * ("12abc" and empty cells do not qualify). With no data rows, no column is
 * reported as numeric.
 */
async function analyzeCSV(file) {
  const text = await file.text();
  const data = parse_csv(text, ",");

  let headers;
  let rows;
  try {
    headers = data.get_headers();
    rows = data.get_rows();
  } finally {
    // The parsed data lives on the WASM heap until it is explicitly freed.
    data.free();
  }

  const isNumeric = (cell) =>
    cell.trim() !== "" && Number.isFinite(Number(cell));

  // Find the numeric columns.
  const numericColumns = headers
    .map((name, index) => ({ name, index }))
    .filter(
      ({ index }) =>
        rows.length > 0 && rows.every((row) => isNumeric(row[index]))
    );

  console.log(
    "数值列:",
    numericColumns.map((column) => column.name)
  );

  // Further statistical analysis per numeric column.
  for (const { name, index } of numericColumns) {
    const values = Float64Array.from(rows, (row) => Number(row[index]));

    const stats = calculate_statistics(values);
    console.log(`${name}: 均值 ${stats.mean.toFixed(2)}`);
  }
}

什么时候用

适合:

  • 大文件 (>1MB)
  • 复杂格式 (Markdown, CSV)
  • 实时预览
  • 批量处理

不适合:

  • 小文件
  • 简单 JSON (用 JSON.parse)
  • 一次性解析