From 7a513031822010612ef1fd0104cb339854d741d3 Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Wed, 24 Jan 2024 21:53:27 +0100
Subject: Add new blog post

---
 src/blog/grab/answer.sh.gsp    |   2 +
 src/blog/grab/example-1.sh.gsp |  16 +++
 src/blog/grab/example-2.sh.gsp |   4 +
 src/blog/grab/example-3.sh.gsp |   4 +
 src/blog/grab/example.vue.gsp  |   7 ++
 src/blog/grab/g.pat.gsp        |   1 +
 src/blog/grab/grep.sh.gsp      |   1 +
 src/blog/grab/index.gsp        | 276 +++++++++++++++++++++++++++++++++++++++++
 src/blog/grab/x.pat.gsp        |   1 +
 src/blog/index.gsp             |   1 +
 src/style.css                  |  17 ++-
 11 files changed, 328 insertions(+), 2 deletions(-)
 create mode 100644 src/blog/grab/answer.sh.gsp
 create mode 100644 src/blog/grab/example-1.sh.gsp
 create mode 100644 src/blog/grab/example-2.sh.gsp
 create mode 100644 src/blog/grab/example-3.sh.gsp
 create mode 100644 src/blog/grab/example.vue.gsp
 create mode 100644 src/blog/grab/g.pat.gsp
 create mode 100644 src/blog/grab/grep.sh.gsp
 create mode 100644 src/blog/grab/index.gsp
 create mode 100644 src/blog/grab/x.pat.gsp

diff --git a/src/blog/grab/answer.sh.gsp b/src/blog/grab/answer.sh.gsp
new file mode 100644
index 0000000..1f634d7
--- /dev/null
+++ b/src/blog/grab/answer.sh.gsp
@@ -0,0 +1,2 @@
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/<date-input.*?>/ g/‌\\bbrowser\\b/ h//'} foo
+…
diff --git a/src/blog/grab/example-1.sh.gsp b/src/blog/grab/example-1.sh.gsp
new file mode 100644
index 0000000..9944003
--- /dev/null
+++ b/src/blog/grab/example-1.sh.gsp
@@ -0,0 +1,16 @@
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-cat} foo
+Hello world, this is
+a paragraph.
+
+This is also a paragraph
+but it contains doubled
+doubled words.
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/[^\\n].‌+?(?=\\n\\n|$)/ g/(\\b\\w+\\b)\\s+\\1/'} foo
+This is also a paragraph
+but it contains doubled
+doubled words.
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .cmt {-# Just like grep, you can display match positions}
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} -f @span .str {-'…'} foo
+@span .grab-file {-foo}@span .grab-sep {-:}@span .grab-pos {-4}@span .grab-sep {-:}@span .grab-pos {-1}@span .grab-sep {-:}This is also a paragraph
+but it contains doubled
+doubled words.
diff --git a/src/blog/grab/example-2.sh.gsp b/src/blog/grab/example-2.sh.gsp
new file mode 100644
index 0000000..5e55a4d
--- /dev/null
+++ b/src/blog/grab/example-2.sh.gsp
@@ -0,0 +1,4 @@
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/[^\\n].‌+?(?=\\n\\n|$)/ g/(\\b\\w+\\b)\\s+\\1/ h/(\\b\\w+\\b)\\s+\\1/'} foo
+This is also a paragraph
+but it contains @span .grab-hl {-doubled}
+@span .grab-hl {-doubled} words.
diff --git a/src/blog/grab/example-3.sh.gsp b/src/blog/grab/example-3.sh.gsp
new file mode 100644
index 0000000..d6076b9
--- /dev/null
+++ b/src/blog/grab/example-3.sh.gsp
@@ -0,0 +1,4 @@
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/[^\\n].‌+?(?=\\n\\n|$)/ g/(\\b\\w+\\b)\\s+\\1/ h//'} foo
+This is also a paragraph
+but it contains @span .grab-hl {-doubled}
+@span .grab-hl {-doubled} words.
diff --git a/src/blog/grab/example.vue.gsp b/src/blog/grab/example.vue.gsp
new file mode 100644
index 0000000..58975c6
--- /dev/null
+++ b/src/blog/grab/example.vue.gsp
@@ -0,0 +1,7 @@
+<@span .fn {-date-input}
+	@span .var {-v-model}@span .op {-=}@span .str {-"date"}
+	@span .var {-class}@span .op {-=}@span .str {-"foo bar"}
+	@span .var {-:browser}@span .op {-=}@span .str {-"true"}
+	@span .var {-:placeholder}@span .op {-=}@span .str {-"today"}
+	@span .var {-required}
+/>
diff --git a/src/blog/grab/g.pat.gsp b/src/blog/grab/g.pat.gsp
new file mode 100644
index 0000000..cebea99
--- /dev/null
+++ b/src/blog/grab/g.pat.gsp
@@ -0,0 +1 @@
+@span .fn {-x}@span .op {-/}[^\\n].‌+?(?=\\n\\n|$)@span .op {-/} @span .fn {-g}@span .op {-/}(\\b\\w+\\b)\\s+\\1@span .op {-/}
diff --git a/src/blog/grab/grep.sh.gsp b/src/blog/grab/grep.sh.gsp
new file mode 100644
index 0000000..1f363a7
--- /dev/null
+++ b/src/blog/grab/grep.sh.gsp
@@ -0,0 +1 @@
+$ @span .fn {-git} grep @span .str {-'<date-input.*browser.*‌>'}
diff --git a/src/blog/grab/index.gsp b/src/blog/grab/index.gsp
new file mode 100644
index 0000000..b9d0d84
--- /dev/null
+++ b/src/blog/grab/index.gsp
@@ -0,0 +1,276 @@
+html lang="en" {
+	head { m4_include(head.gsp) }
+	body {
+		header {
+			div {
+				h1 {-Reinvent The Wheel!}
+				m4_include(nav.gsp)
+			}
+
+			figure .quote {
+				blockquote {
+					p {=
+						You have to do what must be done.  Nobody is going to ask you, “why
+						didn’t you make it?”.  It’s either do it or not.  Do not think about
+						what you’re feeling, do it no matter what.
+					}
+				}
+				figcaption {-Haroon Khan}
+			}
+		}
+
+		main {
+			h2 #story {-Story of a Software Engineer}
+			p {-
+				It was your average Wednesday afternoon, and I was working my job.  My
+				specific task on this day was quite simple: document our custom Vue
+				components that make up most of our product’s m4_abbr(UI).
+			}
+
+			p {-
+				This should be a relatively easy task and for the most part it was, but
+				I had an issue.  Some of these components had some @em{-really} obscure
+				properties that could influence their behavior, and seeing as much of
+				the codebase was written 10 years ago by utter idiots, the code
+				implementing these properties is @em{-really} hard to read.
+			}
+
+			p {-
+				I decided that instead of trying to study the @em{-definitions} of
+				these properties, it would be quite a bit easier to study the
+				@em{-usage} of these properties.  But how do I find them?  Our codebase
+				is hundreds of thousands of lines of code, and these properties have
+				very generic names such as ‘@em{-browser}’.  Additionally while the
+				components are easy to search for, they’re used in hundreds of places
+				and such properties may only be used once or twice.
+			}
+
+			p {-
+				The solution?  I thought it would be the trusty tool in every hacker’s
+				toolbelt: @code{-grep}.
+			}
+
+			h2 #downfall {-The Downfall of Grep}
+			p {-
+				I thought that @code{-grep} would be my saviour.  The tool that would
+				answer the call to find me the usages I so desired.  So I whipped that
+				baby out and went straight to work:
+			}
+
+			figure {
+				pre {= m4_fmt_code(grep.sh.gsp) }
+			}
+
+			p {-
+				You can probably tell from the fact I’m writing this post that this did
+				not work.  If you’ve ever worked with Vue or something similar, you
+				might even be able to figure out why.  For those unfamiliar with the
+				frontend (you’re a treasure that must be preserved), allow me to show
+				you something that is all too common in a Vue codebase:
+			}
+
+			figure {
+				pre .vue {= m4_fmt_code(example.vue.gsp) }
+			}
+
+			p {-
+				The issue here is clear: the property we’re searching for (‘browser’) is
+				on an entirely different line from the component we’re searching for
+				(‘@code{-<date-input>}’).  It’s not enough to search for just the
+				component because it’s used everywhere but only a few rare usages
+				interest me, and it’s not enough to search for just the attribute
+				because many different components have attributes of the same name (and
+				@em{-no} they don’t have the same behavior; the codebase is shit).
+			}
+
+			p {-
+				What I need is a tool that will let me search for patterns that span
+				multiple lines.
+			}
+
+			h2 #grab {-Introducing Grab}
+			figure .quote {
+				blockquote {
+					p {=
+						The current UNIX® text processing tools are weakened by the built-in
+						concept of a line.  There is a simple notation that can describe the
+						‘shape’ of files when the typical array-of-lines picture is
+						inadequate.  That notation is regular expressions.  Using regular
+						expressions to describe the structure in addition to the contents of
+						files has interesting applications, and yields elegant methods for
+						dealing with some problems the current tools handle clumsily.  When
+						operations using these expressions are composed, the result is
+						reminiscent of shell pipelines.
+					}
+				}
+				figcaption {-Rob Pike}
+			}
+
+			p {-
+				That quote is from the abstract of @cite {-Structural Regular
+					Expressions}, a paper written by the one and only Rob Pike back in
+				1987.  It describes an idea by which we stop assuming that all data is
+				organized in lines, and instead use regular expressions to define the
+				shapes comprising our data.
+			}
+
+			p {-
+				I actually had read this paper some years ago and it had always sat in
+				the back of my mind.  I had actually toyed around in the past with an
+				implementation of @code{-grep} that wasn’t strictly line-oriented, but
+				it was very bare-bones, and lacked basic faculties such as reporting the
+				positions of matches, something I desperately needed.
+			}
+
+			p {-
+				So over the following few days I made major changes, rewrote lots of the
+				code, and overall turned my tool — @code{-grab} — into a staple part of
+				my hacker’s toolbelt.
+			}
+
+			h2 #how {-How Grab Finds Text}
+			p {-
+				If you’re familiar with the UNIX environment, you’re probably used to
+				querying text with tools such as @code{-sed} and @code{-awk} using
+				regular expressions.  These are the same regular expressions we as
+				programmers all know and love, but with one important — yet often
+				overlooked — characteristic: you cannot match the newline.
+			}
+
+			p {-
+				The @code{-grab} utility moves away from this limiting paradigm; the
+				newline is treated no differently from any other character you want
+				to match.  Want to match an entire paragraph of text?  The pattern is as
+				simple as ‘@code{-[^\\n].‌+?(?=\\n\\n|$)}’.  It may look
+				complicated if you’re new to regular expressions — m4_abbr(PCRE)s to be
+				specific — but it’s really quite simple.  You just match a non-newline
+				character, and then as many characters as possible until reaching either
+				a double newline, or the end of input.
+			}
+
+			p {-
+				On its own this isn’t too amazing though.  The great thing about
+				@code{-grep} is that it doesn’t just show you matches, but it shows you
+				them in the context of a complete line.  @code{-grab} solves this in the
+				same way described in Rob Pike’s paper: chaining operations.
+			}
+
+			p {-
+				Say we want to iterate not over lines but over paragraphs.  We can use
+				the following @em{-pattern}:
+			}
+
+			figure {
+				pre {= m4_fmt_code(x.pat.gsp) }
+			}
+
+			p {-
+				Here we’re using the ‘x’ operator.  It iterates over all occurrences of
+				the pattern.  In this case we’re iterating over all paragraphs in our
+				input.  Maybe we want to see all paragraphs which contain doubled words
+				(for example: ‘the the’), a common typo found in text files.  For this
+				we can use the ‘g’ operator:
+			}
+
+			figure {
+				pre {= m4_fmt_code(g.pat.gsp) }
+			}
+
+			p {-
+				The fundamental difference between the two operators is that the
+				‘x’ operator specifies the structure to iterate over.  In the context of
+				@code{-grep} these are lines, but in @code{-grab} they can be whatever
+				you want.  The ‘g’ operator on the other hand doesn’t modify the
+				structure of the matches returned to you at all; it simply acts as a
+				filter selecting matches which match the given regular expression.
+			}
+
+			p {-
+				Here’s an interactive example:
+			}
+
+			figure {
+				pre {= m4_fmt_code(example-1.sh.gsp) }
+			}
+
+			p {-
+				This is almost perfect; there’s just one bit missing.  In my interactive
+				example I’ve shown how you can use the power of @code{-grab} to find
+				paragraphs in your files containing doubled words.  This is really handy
+				if you find yourself writing websites, documentation, or other long-form
+				written content.
+			}
+
+			p {-
+				Given my example though, how easily were you able to spot the doubled
+				words?  It probably didn’t stick out to you right away, unlike if it had
+				been highlighted by some bright flashy color.  It is for this reason
+				that the ‘h’ operator exists.  This operator is unique in that it does
+				not change the given selections at all.  Any matches made by previous
+				occurrences of ‘x’ and ‘g’ will be displayed the same with and without
+				the use of ‘h’.
+			}
+
+			p {-
+				The ‘h’ operator is purely for the user.  By using this operator you
+				can specify a pattern for which matching text must be @em{-highlighted}.
+				Let’s apply it to the previous example and see how the doubled words are
+				made instantly obvious to the user:
+			}
+
+			figure {
+				pre {= m4_fmt_code(example-2.sh.gsp) }
+			}
+
+			p {-
+				There is an obvious problem here: the duplication of the regular
+				expression provided to the ‘g’ and ‘h’ operators.  It is @em{-extremely}
+				common that you will want to highlight text that was just matched by a
+				‘g’ operator.  Like, @em{-really} common.  So common in fact that the
+				‘h’ operator supports a shorthand syntax for this exact situation:
+				@code {-h//}.  Giving an empty regular expression as an argument to an
+				operator is illegal with the exception of the ‘h’ operator.  When this
+				operator is given an empty argument, it assumes the regular expression
+				of the previous operator:
+			}
+
+			figure {
+				pre {= m4_fmt_code(example-3.sh.gsp) }
+			}
+
+			h2 #final {-Final Solution}
+			p {-
+				So… what was the final solution to my problem?  How did I find all the
+				@code{-<date-input>} tags in my job’s codebase that were passed the
+				‘browser’ attribute?  Well here’s how:
+			}
+
+			figure {
+				pre {= m4_fmt_code(answer.sh.gsp) }
+			}
+
+			p {-
+				Quick, simple, and elegant.  Just the way I like it!
+			}
+
+			h2 #more {-Additional Operators}
+			p {-
+				Here I’ve shown you the 3 main operators: ‘x’, ‘g’, and ‘h’.  These are
+				not all however!  Each operator also has a capital variant (‘X’, ‘G’,
+				‘H’) which behaves the same but instead of working on text that matches
+				the given pattern, these operators match on text which @em{-doesn’t}
+				match the given pattern.
+			}
+
+			p {-
+				These operators allow for better pattern matching.  For example a
+				pattern to match all numbers which contain a ‘3’ but which aren’t ‘1337’
+				could be written as @code{-x/[0-9]+/ g/3/ G/^1337$/}.
+			}
+		}
+
+		hr{}
+		
+		footer { m4_footer }
+	}
+}
diff --git a/src/blog/grab/x.pat.gsp b/src/blog/grab/x.pat.gsp
new file mode 100644
index 0000000..1ac0e34
--- /dev/null
+++ b/src/blog/grab/x.pat.gsp
@@ -0,0 +1 @@
+@span .fn {-x}@span .op {-/}[^\\n].‌+?(?=\\n\\n|$)@span .op {-/}
diff --git a/src/blog/index.gsp b/src/blog/index.gsp
index 0673da3..fce8d08 100644
--- a/src/blog/index.gsp
+++ b/src/blog/index.gsp
@@ -28,6 +28,7 @@ html lang="en" {
 			p {-Posts:}
 
 			ul {
+				m4_article(grab, {-Making Grep Better})
 				m4_article(andy-val, {-Values in Andy})
 				m4_article(new-sh, {-Making a New Shell})
 				m4_article(extend, {-Extensible Scripting})
diff --git a/src/style.css b/src/style.css
index 19a32ab..3941394 100644
--- a/src/style.css
+++ b/src/style.css
@@ -50,8 +50,15 @@ pre, code, kbd, samp {
      pre  > code { color: var(--fg);    }
 :not(pre) > code { white-space: nowrap; }
 
-pre.js, pre.sh   { tab-size: 4; }
-pre.pug, pre.gsp { tab-size: 2; }
+pre.js,
+pre.sh {
+	tab-size: 4;
+}
+pre.gsp,
+pre.pug,
+pre.vue {
+	tab-size: 2;
+}
 
 h1 {
 	font-size: 1.8rem;
@@ -200,6 +207,11 @@ dl {
 .gsp-val  { color: var(--aqua);   }
 .gsp-op   { color: var(--lesser); }
 
+.grab-file { color: #ECABF7; }
+.grab-hl   { color: #FF4D51; font-weight: bold; }
+.grab-pos  { color: #47B413; }
+.grab-sep  { color: #24DFC4; }
+
 .article {
 	display: flex;
 	justify-content: space-between;
@@ -264,6 +276,7 @@ dl {
 	abbr.led::before   { content: 'Light-Emitting Diode';                }
 	abbr.nas::before   { content: 'Network Attached Storage';            }
 	abbr.os::before    { content: 'Operating System';                    }
+	abbr.pcre::before  { content: 'Perl Compatible Regular Expression';  }
 	abbr.pdf::before   { content: 'Portable Document Format';            }
 	abbr.posix::before { content: 'Portable Operating System Interface'; }
 	abbr.qr::before    { content: 'Quick Response';                      }
-- 
cgit v1.2.3