feldera/python/felderize/spark/docs.py at felderize · feldera/feldera

History

312 lines (247 loc) · 11.2 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

from __future__ import annotations

import re

from pathlib import Path

import yaml

# Map each category to its doc file. The file names are joined onto the docs
# directory by load_docs(), and the keys double as the category names used by
# the pattern tables below.
_DOC_FILES: dict[str, str] = {
    "types": "types.md",
    "string": "string.md",
    "datetime": "datetime.md",
    "json": "json.md",
    "aggregates": "aggregates.md",
    "array": "array.md",
    "map": "map.md",
    "decimal": "decimal.md",
    "float": "float.md",
    "casts": "casts.md",
    "comparisons": "comparisons.md",
}

# SQL construct patterns that cannot be derived from the function index
# (keywords, operators, syntax forms rather than named functions).
# Keep these specific — broad patterns like \bDATE\b match almost every query.
# _detect_categories() applies every pattern with re.IGNORECASE.
_EXTRA_PATTERNS: dict[str, list[str]] = {
    "datetime": [r"\bINTERVAL\b"],  # DATE/TIMESTAMP covered by index function names
    "aggregates": [r"\bGROUP\s+BY\b", r"\bHAVING\b", r"\bOVER\s*\("],
    "array": [r"\bEXPLODE\b", r"\bUNNEST\b", r"\bsize\s*\("],
    "map": [r"\bMAP\s*<"],  # MAP( covered by index; MAP< is type syntax
    "json": [r"\bVARIANT\b"],  # JSON covered by index function names
    "casts": [r"::"],  # CAST covered by index; :: is operator syntax
    "comparisons": [r"\bCASE\s+WHEN\b"],
}

# Spark names that appear in SQL but are not in the Feldera index: Spark
# function names, plus the NUMERIC and FLOAT keywords for decimal/float.
_SPARK_ALIASES: dict[str, list[str]] = {
    "json": [r"\bget_json_object\b", r"\bfrom_json\b", r"\bjson_tuple\b"],
    "array": [r"\barray_contains\b", r"\bsort_array\b", r"\barray_distinct\b"],
    "decimal": [r"\bNUMERIC\b"],
    "float": [r"\bFLOAT\b"],
}

# Regex to find HTML anchor IDs embedded in doc files: <a id="name">
# Group 1 is the id value; matching is case-insensitive.
_ANCHOR_ID_RE = re.compile(r'<a\s+id="([^"]+)"', re.IGNORECASE)

# Regex to detect function calls in SQL: FUNC_NAME(
# Any identifier directly followed by "(" matches, so this also captures
# keywords written that way (e.g. "IN (").
_SQL_FUNC_RE = re.compile(r"\b([A-Z_][A-Z_0-9]*)\s*\(", re.IGNORECASE)

def _build_categories_from_index(
    index_path: Path,
) -> tuple[dict[str, list[str]], dict[str, list[tuple[str, str]]]]:
    """Parse function-index.md.

    Only lines that start with a ``*`` bullet followed by a backquoted
    function name are considered; every ``[category](file.md#anchor)`` link
    on such a line contributes to the result.

    Args:
        index_path: Location of the function index. A missing file yields
            empty results instead of raising.

    Returns:
        cats:         category → [\\bFUNC\\b, ...] trigger patterns
        func_anchors: FUNC_NAME_UPPER → [(doc_filename, anchor_id), ...]
    """
    # "types" is always selected by _detect_categories(), so it gets no triggers.
    known = set(_DOC_FILES) - {"types"}
    cats: dict[str, list[str]] = {cat: [] for cat in _DOC_FILES}
    func_anchors: dict[str, list[tuple[str, str]]] = {}
    if not index_path.is_file():
        return cats, func_anchors

    func_re = re.compile(r"^\* `([A-Z_][A-Z_0-9 ]*)`", re.IGNORECASE)
    link_re = re.compile(r"\[([a-z]+)\]\(([^)#]+)(?:#([^)]+))?\)")

    # Mirrors the lists in ``cats`` so the duplicate check is O(1) per link.
    seen: dict[str, set[str]] = {cat: set() for cat in known}

    # Decode explicitly: the locale's default encoding is platform dependent,
    # and the Markdown docs are expected to be UTF-8.
    for line in index_path.read_text(encoding="utf-8").splitlines():
        m = func_re.match(line)
        if not m:
            continue
        func_name = m.group(1).strip()
        func_upper = func_name.upper()
        keyword = rf"\b{re.escape(func_name)}\b"
        for link_m in link_re.finditer(line):
            # cat: link text, doc_file: e.g. "string.md",
            # anchor: e.g. "upper" (None when the link has no fragment).
            cat, doc_file, anchor = link_m.groups()
            if cat in known and keyword not in seen[cat]:
                seen[cat].add(keyword)
                cats[cat].append(keyword)
            if anchor:
                func_anchors.setdefault(func_upper, []).append((doc_file, anchor))
    return cats, func_anchors

# Default docs location: "docs.feldera.com/docs/sql" under the fourth ancestor
# directory of this file (presumably the repository root — confirm if the
# package is ever relocated).
_DEFAULT_DOCS_DIR = (
    Path(__file__).resolve().parents[3] / "docs.feldera.com" / "docs" / "sql"
)

# Cache: docs_dir → (categories, func_anchors)
# Populated by _get_cats_and_anchors(); nothing in this module evicts entries.
_cats_cache: dict[
    Path, tuple[dict[str, list[str]], dict[str, list[tuple[str, str]]]]
] = {}

def _get_cats_and_anchors(
    docs_dir: Path,
) -> tuple[dict[str, list[str]], dict[str, list[tuple[str, str]]]]:
    """Look up, or build and memoise, the trigger patterns and anchor index.

    On a cache miss the function index under docs_dir is parsed, then the
    hand-written _EXTRA_PATTERNS and _SPARK_ALIASES are appended to their
    categories, skipping any pattern that is already present.
    """
    hit = _cats_cache.get(docs_dir)
    if hit is not None:
        return hit

    index_file = docs_dir / "function-index.md"
    categories, anchors = _build_categories_from_index(index_file)
    for table in (_EXTRA_PATTERNS, _SPARK_ALIASES):
        for name, extras in table.items():
            present = set(categories.get(name, []))
            for extra in extras:
                if extra in present:
                    continue
                categories.setdefault(name, []).append(extra)
                present.add(extra)

    entry = (categories, anchors)
    _cats_cache[docs_dir] = entry
    return entry

# Module-level categories for load_examples() (which has no docs_dir).
# Built from the default docs location; func_anchors not needed for examples.
# NOTE: this call runs at import time, so importing the module checks for the
# index file under _DEFAULT_DOCS_DIR and parses it when present.
_CATEGORIES, _ = _get_cats_and_anchors(_DEFAULT_DOCS_DIR)

# ── Section-level doc parsing ────────────────────────────────────────────────

# Cache: filepath → (preamble, {heading: content}, {anchor_id: heading})
# Populated by _get_doc_sections(); missing files are cached as empty results.
_section_cache: dict[Path, tuple[str, dict[str, str], dict[str, str]]] = {}

def _parse_doc_sections(
    content: str,
) -> tuple[str, dict[str, str], dict[str, str]]:
    """Split a doc file into (preamble, sections, anchor_map).

    preamble   — text before the first ## heading (the whole text when there
                 is no ## heading at all)
    sections   — ordered dict: ## heading text → section content (includes
                 heading line); when a heading text occurs more than once the
                 bodies are concatenated in document order instead of the
                 later one replacing the earlier one
    anchor_map — <a id="x"> → ## heading text for every anchor found inside
                 a section (anchors in the preamble are not recorded)
    """
    preamble_lines: list[str] = []
    chunks: list[tuple[str, list[str]]] = []

    # Lines accumulate into ``current``: the preamble until the first heading,
    # then the line list of the most recently opened section.
    current = preamble_lines
    for line in content.splitlines(keepends=True):
        if line.startswith("## "):
            current = [line]
            chunks.append((line.rstrip(), current))
        else:
            current.append(line)

    sections: dict[str, str] = {}
    anchor_map: dict[str, str] = {}
    for heading, lines in chunks:
        body = "".join(lines)
        # Merge rather than overwrite so a repeated heading loses no content.
        sections[heading] = sections.get(heading, "") + body
        for am in _ANCHOR_ID_RE.finditer(body):
            anchor_map[am.group(1)] = heading

    return "".join(preamble_lines), sections, anchor_map

def _get_doc_sections(
    doc_path: Path,
) -> tuple[str, dict[str, str], dict[str, str]]:
    """Return parsed sections for a doc file (cached).

    The result has the same shape as _parse_doc_sections(): (preamble,
    sections, anchor_map). A path that is not a regular file is cached as
    ("", {}, {}) so the file system is consulted only once per path.
    """
    if doc_path not in _section_cache:
        if doc_path.is_file():
            # Decode explicitly: the locale's default encoding is platform
            # dependent, and the Markdown docs are expected to be UTF-8.
            parsed = _parse_doc_sections(doc_path.read_text(encoding="utf-8"))
        else:
            parsed = ("", {}, {})
        _section_cache[doc_path] = parsed
    return _section_cache[doc_path]

def _load_relevant_sections(doc_path: Path, relevant_anchors: set[str]) -> str:
    """Return the preamble plus the ## sections holding a relevant anchor.

    When the file has no ## sections the preamble is returned on its own.
    When none of relevant_anchors maps to a section (including the case of an
    empty set), the preamble followed by every section is returned, so a
    matched category never produces empty docs.
    """
    intro, by_heading, heading_of = _get_doc_sections(doc_path)

    # Plain file with no ## headings: nothing to filter.
    if not by_heading:
        return intro

    wanted = {heading_of[a] for a in relevant_anchors if a in heading_of}
    if not wanted:
        # Nothing specific to select, so fall back to the whole document.
        return intro + "".join(by_heading.values())

    selected = [text for title, text in by_heading.items() if title in wanted]
    if intro.strip():
        selected.insert(0, intro)
    return "".join(selected)

# ── Category detection ───────────────────────────────────────────────────────

def _detect_categories(
    sql: str,
    cats: dict[str, list[str]] | None = None,
) -> set[str]:
    """Return the names of all categories triggered by the SQL text.

    A category is triggered when at least one of its regex patterns is found
    in sql (case-insensitively). "types" is always part of the result. When
    cats is None the module-level _CATEGORIES table is used.
    """
    table = _CATEGORIES if cats is None else cats
    hits = {
        name
        for name, triggers in table.items()
        if any(re.search(trigger, sql, re.IGNORECASE) for trigger in triggers)
    }
    hits.add("types")  # Always include types
    return hits

def _detect_sql_functions(sql: str) -> set[str]:
    """Collect, upper-cased, every identifier that is followed by "(" in sql."""
    # _SQL_FUNC_RE has a single capture group, so findall() yields the names.
    return {name.upper() for name in _SQL_FUNC_RE.findall(sql)}

# ── Public API ───────────────────────────────────────────────────────────────

def load_docs(sql: str, docs_dir: Path | None = None) -> str:
    """Assemble the Feldera doc text that is relevant to a SQL input.

    Categories are chosen by matching trigger patterns against the SQL. Within
    each category's doc file, only the sections whose <a id> anchors belong to
    functions called in the SQL are kept; a category with no such anchor
    (e.g. one matched only through GROUP BY) contributes its whole file.

    Returns "" when docs_dir (default: _DEFAULT_DOCS_DIR) is not a directory.
    Otherwise returns one "### <category>" chunk per non-empty category, in
    sorted category order, joined by "---" separators.
    """
    root = _DEFAULT_DOCS_DIR if docs_dir is None else docs_dir
    if not root.is_dir():
        return ""

    cats, func_anchors = _get_cats_and_anchors(root)
    matched = _detect_categories(sql, cats)

    # Group the anchors of every called function by the doc file they live in.
    anchors_by_file: dict[str, set[str]] = {}
    for func in _detect_sql_functions(sql):
        for filename, anchor in func_anchors.get(func, []):
            anchors_by_file.setdefault(filename, set()).add(anchor)

    chunks: list[str] = []
    for category in sorted(matched):
        filename = _DOC_FILES.get(category)
        if filename is None:
            continue
        text = _load_relevant_sections(
            root / filename, anchors_by_file.get(filename, set())
        )
        if text.strip():
            chunks.append(f"### {category}\n\n{text}")
    return "\n\n---\n\n".join(chunks)

# Cache: example file path → (categories declared in front matter, body text)
_example_cache: dict[Path, tuple[set[str], str]] = {}


def _parse_example(raw: str) -> tuple[set[str], str]:
    """Split an example file's text into (categories, body).

    A file that starts with "---" and contains a second "---" is treated as
    having YAML front matter; its "categories" entry may be a list of strings
    or a single string. Anything else (missing key, null, other types,
    invalid YAML) yields no categories. Without front matter the text is
    returned unchanged as the body.
    """
    if not raw.startswith("---"):
        return set(), raw
    parts = raw.split("---", 2)
    if len(parts) < 3:
        return set(), raw

    try:
        meta = yaml.safe_load(parts[1])
    except yaml.YAMLError:
        # Best effort: keep the body, the example just carries no categories.
        meta = None

    cats: set[str] = set()
    if isinstance(meta, dict):
        declared = meta.get("categories")
        if isinstance(declared, str):
            # A scalar must not be split into characters by set().
            cats = {declared}
        elif isinstance(declared, (list, tuple, set)):
            cats = {item for item in declared if isinstance(item, str)}
    return cats, parts[2].strip()


def load_examples(sql: str, examples_dir: Path | None = None) -> str:
    """Return validated translation examples relevant to the SQL input.

    Every ``*.md`` file in examples_dir (default: ``data/samples`` next to
    this module) is parsed once and cached. An example is included when its
    front-matter categories intersect the categories detected in sql.

    Returns "" when examples_dir is not a directory; otherwise the matching
    example bodies, in sorted file order, joined by "---" separators.
    """
    if examples_dir is None:
        examples_dir = Path(__file__).resolve().parent / "data" / "samples"
    if not examples_dir.is_dir():
        return ""

    categories = _detect_categories(sql)
    sections: list[str] = []
    for filepath in sorted(examples_dir.glob("*.md")):
        if filepath not in _example_cache:
            # Decode explicitly: the locale's default encoding is platform
            # dependent, and the sample files are expected to be UTF-8.
            _example_cache[filepath] = _parse_example(
                filepath.read_text(encoding="utf-8")
            )
        cats, body = _example_cache[filepath]
        if cats & categories:
            sections.append(body)
    return "\n\n---\n\n".join(sections)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

docs.py

Latest commit

History

docs.py

File metadata and controls