From 9463cda98b38b37cfb5bd5ee1ee5123221f046fb Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Sun, 28 Apr 2024 02:37:49 +0200 Subject: Add blogpost about Grab --- src/blog/grab/answer.sh | 2 + src/blog/grab/example-1.sh | 16 +++ src/blog/grab/example-2.sh | 4 + src/blog/grab/example-3.sh | 4 + src/blog/grab/example.vue | 7 ++ src/blog/grab/g.pat | 1 + src/blog/grab/grep.sh | 1 + src/blog/grab/index.gsp | 275 +++++++++++++++++++++++++++++++++++++++++++++ src/blog/grab/x.pat | 1 + src/blog/index.gsp | 10 +- 10 files changed, 316 insertions(+), 5 deletions(-) create mode 100644 src/blog/grab/answer.sh create mode 100644 src/blog/grab/example-1.sh create mode 100644 src/blog/grab/example-2.sh create mode 100644 src/blog/grab/example-3.sh create mode 100644 src/blog/grab/example.vue create mode 100644 src/blog/grab/g.pat create mode 100644 src/blog/grab/grep.sh create mode 100644 src/blog/grab/index.gsp create mode 100644 src/blog/grab/x.pat (limited to 'src/blog') diff --git a/src/blog/grab/answer.sh b/src/blog/grab/answer.sh new file mode 100644 index 0000000..117aea0 --- /dev/null +++ b/src/blog/grab/answer.sh @@ -0,0 +1,2 @@ +$ grab 'x// g/\bbrowser\b/ h//' foo +… diff --git a/src/blog/grab/example-1.sh b/src/blog/grab/example-1.sh new file mode 100644 index 0000000..97a4277 --- /dev/null +++ b/src/blog/grab/example-1.sh @@ -0,0 +1,16 @@ +$ cat foo +Hello world, this is +a paragraph. + +This is also a paragraph +but it contains doubled +doubled words. +$ grab 'x/[^\n].+?(?=\n\n|$)/ g/(\b\w+\b)\s+\1/' foo +This is also a paragraph +but it contains doubled +doubled words. +$ # Just like grep, you can display match positions +$ grab -f '…' foo +foo:4:1:This is also a paragraph +but it contains doubled +doubled words. 
diff --git a/src/blog/grab/example-2.sh b/src/blog/grab/example-2.sh new file mode 100644 index 0000000..883c580 --- /dev/null +++ b/src/blog/grab/example-2.sh @@ -0,0 +1,4 @@ +$ grab 'x/[^\n].+?(?=\n\n|$)/ g/(\b\w+\b)\s+\1/ h/(\b\w+\b)\s+\1/' foo +This is also a paragraph +but it contains <doubled> +<doubled> words. diff --git a/src/blog/grab/example-3.sh b/src/blog/grab/example-3.sh new file mode 100644 index 0000000..cfebed3 --- /dev/null +++ b/src/blog/grab/example-3.sh @@ -0,0 +1,4 @@ +$ grab 'x/[^\n].+?(?=\n\n|$)/ g/(\b\w+\b)\s+\1/ h//' foo +This is also a paragraph +but it contains <doubled> +<doubled> words. diff --git a/src/blog/grab/example.vue b/src/blog/grab/example.vue new file mode 100644 index 0000000..424efb2 --- /dev/null +++ b/src/blog/grab/example.vue @@ -0,0 +1,7 @@ + diff --git a/src/blog/grab/g.pat b/src/blog/grab/g.pat new file mode 100644 index 0000000..91e49b3 --- /dev/null +++ b/src/blog/grab/g.pat @@ -0,0 +1 @@ +x/[^\n].+?(?=\n\n|$)/ g/(\b\w+\b)\s+\1/ diff --git a/src/blog/grab/grep.sh b/src/blog/grab/grep.sh new file mode 100644 index 0000000..ecd3f3e --- /dev/null +++ b/src/blog/grab/grep.sh @@ -0,0 +1 @@ +$ git grep '' diff --git a/src/blog/grab/index.gsp b/src/blog/grab/index.gsp new file mode 100644 index 0000000..666c30f --- /dev/null +++ b/src/blog/grab/index.gsp @@ -0,0 +1,275 @@ +m4_define(HL, ‘‘m4_patsubst($1, ‘‘<\([^>]*\)>’’, ‘‘@span .hl-red {=\1}’’)’’)m4_dnl +html lang="en" { + head { HEAD } + body { + header { + div { + h1 {-Reinvent The Wheel!} + INCLUDE(nav.gsp) + } + + figure .quote { + blockquote { + p {= + You have to do what must be done. Nobody is going to ask you, “why + didn’t you make it?”. It’s either do it or not. Do not think about + what you’re feeling, do it no matter what. + } + } + figcaption {-Haroon Khan} + } + } + + main { + h2 #story {-Story of a Software Engineer} + p {- + It was your average Wednesday afternoon, and I was working my job. 
My + specific task on this day was quite simple: document our custom Vue + components that make up most of our product’s UI. + } + + p {- + This should be a relatively easy task and for the most part it was, but + I had an issue. Some of these components had some @em{-really} obscure + properties that could influence their behavior, and seeing as much of + the codebase was written 10 years ago by utter idiots, the code + implementing these properties is @em{-really} hard to read. + } + + p {- + I decided that it would be quite a bit easier, instead of trying to + study the @em{-definitions} of these properties, to try to study the + @em{-usage} of these properties. But how do I find them? Our codebase + is hundreds of thousands of lines of code, and these properties have + very generic names such as ‘@em{-browser}’. Additionally, while the + components are easy to search for, they’re used in hundreds of places + and such properties may only be used once or twice. + } + + p {- + The solution? I thought it would be the trusty tool in every hacker’s + toolbelt: @code{-grep}. + } + + h2 #downfall {-The Downfall of Grep} + p {- + I thought that @code{-grep} would be my saviour. The tool that would + answer the call to find me the usages I so desired. So I whipped that + baby out and went straight to work: + } + + figure { + pre { FMT_CODE(grep.sh) } + } + + p {- + You can probably tell from the fact I’m writing this post that this did + not work. If you’ve ever worked with Vue or something similar, you + might even be able to figure out why. For those unfamiliar with the + frontend (you’re a treasure that must be preserved), allow me to show + you something that is all too common in a Vue codebase: + } + + figure { + pre .vue { FMT_CODE(example.vue) } + } + + p {- + The issue here is clear: the property we’re searching for (‘browser’) is + on an entirely different line from the component we’re searching for + (‘@code{-}’). 
It’s not enough to search for just the + component because it’s used everywhere but only a few rare usages + interest me, and it’s not enough to search for just the attribute + because many different components have attributes of the same name (and + @em{-no} they don’t have the same behavior; the codebase is shit). + } + + p {- + What I need is a tool that will let me search for patterns that span + multiple lines. + } + + h2 #grab {-Introducing Grab} + figure .quote { + blockquote { + p {= + The current UNIX® text processing tools are weakened by the built-in + concept of a line. There is a simple notation that can describe the + ‘shape’ of files when the typical array-of-lines picture is + inadequate. That notation is regular expressions. Using regular + expressions to describe the structure in addition to the contents of + files has interesting applications, and yields elegant methods for + dealing with some problems the current tools handle clumsily. When + operations using these expressions are composed, the result is + reminiscent of shell pipelines. + } + } + figcaption {-Rob Pike} + } + + p {- + That quote is from the abstract of @cite {-Structural Regular + Expressions}, a paper written by the one and only Rob Pike back in + 1987. It describes an idea by which we stop assuming that all data is + organized in lines, and instead use regular expressions to define the + shapes comprising our data. + } + + p {- + I actually had read this paper some years ago and it had always sat in + the back of my mind. I had actually toyed around in the past with an + implementation of @code{-grep} that wasn’t strictly line-oriented, but + it was very bare-bones, and lacked basic faculties such as reporting the + positions of matches, something I desperately needed. + } + + p {- + So over the following few days I made major changes, rewrote lots of the + code, and overall turned my tool — @code{-grab} — into a staple part of + my hacker’s toolbelt. 
+ } + + h2 #how {-How Grab Finds Text} + p {- + If you’re familiar with the UNIX environment, you’re probably used to + querying text with tools such as @code{-sed} and @code{-awk} using + regular expressions. These are the same regular expressions we as + programmers all know and love, but with one important — yet often + overlooked — characteristic: you cannot match the newline. + } + + p {- + The @code{-grab} utility moves away from this limiting paradigm; the + newline is treated no differently from any other character you want + to match. Want to match an entire paragraph of text? The pattern is as + simple as ‘@code{-[^\\n].‌+?(?=\\n\\n|$)}’. It may look + complicated if you’re new to regular expressions — PCREs to be specific + — but it’s really quite simple. You just match a non-newline character, + and then as many characters as possible until reaching either a double + newline, or the end of input. + } + + p {- + On its own this isn’t too amazing though. The great thing about + @code{-grep} is that it doesn’t just show you matches, but it shows you + them in the context of a complete line. @code{-grab} solves this in the + same way described in Rob Pike’s paper: chaining operations. + } + + p {- + Say we want to iterate not over lines but over paragraphs. We can use + the following @em{-pattern}: + } + + figure { + pre { FMT_CODE(x.pat) } + } + + p {- + Here we’re using the ‘x’ operator. It iterates over all occurrences of + the pattern. In this case we’re iterating over all paragraphs in our + input. Maybe we want to see all paragraphs which contain doubled words + (for example: ‘the the’), a common typo found in text files. For this + we can use the ‘g’ operator: + } + + figure { + pre { FMT_CODE(g.pat) } + } + + p {- + The fundamental difference between the two operators is that the + ‘x’ operator specifies the structure to iterate over. In the context of + @code{-grep} these are lines, but in @code{-grab} they can be whatever + you want. 
The ‘g’ operator on the other hand doesn’t modify the + structure of the matches returned to you at all; it simply acts as a + filter selecting matches which match the given regular expression. + } + + p {- + Here’s an interactive example: + } + + figure { + pre { FMT_CODE(example-1.sh) } + } + + p {- + This is almost perfect; there’s just one bit missing. In my interactive + example I’ve shown how you can use the power of @code{-grab} to find + paragraphs in your files containing doubled words. This is really handy + if you find yourself writing websites, documentation, or other long-form + written content. + } + + p {- + Given my example though, how easily were you able to spot the doubled + words? It probably didn’t stick out to you right away, unlike if it had + been highlighted by some bright flashy color. It is for this reason + that the ‘h’ operator exists. This operator is unique in that it does + not change the given selections at all. Any matches made by previous + occurrences of ‘x’ and ‘g’ will be displayed the same with and without + the use of ‘h’. + } + + p {- + The ‘h’ operator is purely for the user. By using this operator you + can specify a pattern for which matching text must be @em{-highlighted}. + Let’s apply it to the previous example and see how the doubled words are + made instantly obvious to the user: + } + + figure { + pre { HL(FMT_CODE(example-2.sh)) } + } + + p {- + There is an obvious problem here: the duplication of the regular + expression provided to the ‘g’ and ‘h’ operators. It is @em{-extremely} + common that you will want to highlight text that was just matched by a + ‘g’ operator. Like, @em{-really} common. So common in fact that the + ‘h’ operator supports a shorthand syntax for this exact situation: + @code {-h//}. Giving an empty regular expression as an argument to an + operator is illegal with the exception of the ‘h’ operator. 
When this + operator is given an empty argument, it assumes the regular expression + of the previous operator: + } + + figure { + pre { HL(FMT_CODE(example-3.sh)) } + } + + h2 #final {-Final Solution} + p {- + So… what was the final solution to my problem? How did I find all the + @code{-} tags in my job’s codebase that were passed the + ‘browser’ attribute? Well here’s how: + } + + figure { + pre { FMT_CODE(answer.sh) } + } + + p {- + Quick, simple, and elegant. Just the way I like it! + } + + h2 #more {-Additional Operators} + p {- + Here I’ve shown you the 3 main operators: ‘x’, ‘g’, and ‘h’. These are + not all however! Each operator also has a capital variant (‘X’, ‘G’, + ‘H’) which behaves the same but instead of working on text that matches + the given pattern, these operators match on text which @em{-doesn’t} + match the given pattern. + } + + p {- + These operators allow for better pattern matching. For example a + pattern to match all numbers which contain a ‘3’ but which aren’t ‘1337’ + could be written as @code{-x/[0-9]+/ g/3/ G/^1337$/}. 
+ } + } + + footer { FOOT } + } +} diff --git a/src/blog/grab/x.pat b/src/blog/grab/x.pat new file mode 100644 index 0000000..d8793e7 --- /dev/null +++ b/src/blog/grab/x.pat @@ -0,0 +1 @@ +x/[^\n].+?(?=\n\n|$)/ diff --git a/src/blog/index.gsp b/src/blog/index.gsp index 7e53697..e869c7e 100644 --- a/src/blog/index.gsp +++ b/src/blog/index.gsp @@ -28,12 +28,12 @@ html lang="en" { p {-Posts:} ul { - m4_dnl ARTICLE(grab, {-Making Grep Better}) - m4_dnl ARTICLE(andy-val, {-Values in Andy}) - m4_dnl ARTICLE(new-sh, {-Making a New Shell}) - m4_dnl ARTICLE(extend, {-Extensible Scripting}) - m4_dnl ARTICLE(fw-ec, {-Patching My Laptop’s Embedded Controller}) + / ARTICLE(andy-val, {-Values in Andy}) + / ARTICLE(new-sh, {-Making a New Shell}) + / ARTICLE(extend, {-Extensible Scripting}) + / ARTICLE(fw-ec, {-Patching My Laptop’s Embedded Controller}) + ARTICLE(grab, {-Making Grep Better}) ARTICLE(gsp, {-Writing an HTML Preprocessor (feat. Tree-Sitter)}) ARTICLE(nvim-ts, {-Hacking with Tree-Sitter on Neovim}) } -- cgit v1.2.3