From 7a513031822010612ef1fd0104cb339854d741d3 Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Wed, 24 Jan 2024 21:53:27 +0100 Subject: Add new blog post --- src/blog/grab/answer.sh.gsp | 2 + src/blog/grab/example-1.sh.gsp | 16 +++ src/blog/grab/example-2.sh.gsp | 4 + src/blog/grab/example-3.sh.gsp | 4 + src/blog/grab/example.vue.gsp | 7 ++ src/blog/grab/g.pat.gsp | 1 + src/blog/grab/grep.sh.gsp | 1 + src/blog/grab/index.gsp | 276 +++++++++++++++++++++++++++++++++++++++++ src/blog/grab/x.pat.gsp | 1 + src/blog/index.gsp | 1 + src/style.css | 17 ++- 11 files changed, 328 insertions(+), 2 deletions(-) create mode 100644 src/blog/grab/answer.sh.gsp create mode 100644 src/blog/grab/example-1.sh.gsp create mode 100644 src/blog/grab/example-2.sh.gsp create mode 100644 src/blog/grab/example-3.sh.gsp create mode 100644 src/blog/grab/example.vue.gsp create mode 100644 src/blog/grab/g.pat.gsp create mode 100644 src/blog/grab/grep.sh.gsp create mode 100644 src/blog/grab/index.gsp create mode 100644 src/blog/grab/x.pat.gsp diff --git a/src/blog/grab/answer.sh.gsp b/src/blog/grab/answer.sh.gsp new file mode 100644 index 0000000..1f634d7 --- /dev/null +++ b/src/blog/grab/answer.sh.gsp @@ -0,0 +1,2 @@ +@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x// g/‌\\bbrowser\\b/ h//'} foo +… diff --git a/src/blog/grab/example-1.sh.gsp b/src/blog/grab/example-1.sh.gsp new file mode 100644 index 0000000..9944003 --- /dev/null +++ b/src/blog/grab/example-1.sh.gsp @@ -0,0 +1,16 @@ +@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-cat} foo +Hello world, this is +a paragraph. + +This is also a paragraph +but it contains doubled +doubled words. +@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/[^\\n].‌+?(?=\\n\\n|$)/ g/(\\b\\w+\\b)\\s+\\1/'} foo +This is also a paragraph +but it contains doubled +doubled words. 
+@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .cmt {-# Just like grep, you can display match positions} +@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} -f @span .str {-'…'} foo +@span .grab-file {-foo}@span .grab-sep {-:}@span .grab-pos {-4}@span .grab-sep {-:}@span .grab-pos {-1}@span .grab-sep {-:}This is also a paragraph +but it contains doubled +doubled words. diff --git a/src/blog/grab/example-2.sh.gsp b/src/blog/grab/example-2.sh.gsp new file mode 100644 index 0000000..5e55a4d --- /dev/null +++ b/src/blog/grab/example-2.sh.gsp @@ -0,0 +1,4 @@ +@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/[^\\n].‌+?(?=\\n\\n|$)/ g/(\\b\\w+\\b)\\s+\\1/ h/(\\b\\w+\\b)\\s+\\1/'} foo +This is also a paragraph +but it contains @span .grab-hl {-doubled} +@span .grab-hl {-doubled} words. diff --git a/src/blog/grab/example-3.sh.gsp b/src/blog/grab/example-3.sh.gsp new file mode 100644 index 0000000..d6076b9 --- /dev/null +++ b/src/blog/grab/example-3.sh.gsp @@ -0,0 +1,4 @@ +@span .p {-thomas} @span .pp {-~} @span .p {-〉}@span .fn {-grab} @span .str {-'x/[^\\n].‌+?(?=\\n\\n|$)/ g/(\\b\\w+\\b)\\s+\\1/ h//'} foo +This is also a paragraph +but it contains @span .grab-hl {-doubled} +@span .grab-hl {-doubled} words. 
diff --git a/src/blog/grab/example.vue.gsp b/src/blog/grab/example.vue.gsp new file mode 100644 index 0000000..58975c6 --- /dev/null +++ b/src/blog/grab/example.vue.gsp @@ -0,0 +1,7 @@ +<@span .fn {-date-input} + @span .var {-v-model}@span .op {-=}@span .str {-"date"} + @span .var {-class}@span .op {-=}@span .str {-"foo bar"} + @span .var {-:browser}@span .op {-=}@span .str {-"true"} + @span .var {-:placeholder}@span .op {-=}@span .str {-"today"} + @span .var {-required} +/> diff --git a/src/blog/grab/g.pat.gsp b/src/blog/grab/g.pat.gsp new file mode 100644 index 0000000..cebea99 --- /dev/null +++ b/src/blog/grab/g.pat.gsp @@ -0,0 +1 @@ +@span .fn {-x}@span .op {-/}[^\\n].‌+?(?=\\n\\n|$)@span .op {-/} @span .fn {-g}@span .op {-/}(\\b\\w+\\b)\\s+\\1@span .op {-/} diff --git a/src/blog/grab/grep.sh.gsp b/src/blog/grab/grep.sh.gsp new file mode 100644 index 0000000..1f363a7 --- /dev/null +++ b/src/blog/grab/grep.sh.gsp @@ -0,0 +1 @@ +$ @span .fn {-git} grep @span .str {-''} diff --git a/src/blog/grab/index.gsp b/src/blog/grab/index.gsp new file mode 100644 index 0000000..b9d0d84 --- /dev/null +++ b/src/blog/grab/index.gsp @@ -0,0 +1,276 @@ +html lang="en" { + head { m4_include(head.gsp) } + body { + header { + div { + h1 {-Reinvent The Wheel!} + m4_include(nav.gsp) + } + + figure .quote { + blockquote { + p {= + You have to do what must be done. Nobody is going to ask you, “why + didn’t you make it?”. It’s either do it or not. Do not think about + what you’re feeling, do it no matter what. + } + } + figcaption {-Haroon Khan} + } + } + + main { + h2 #story {-Story of a Software Engineer} + p {- + It was your average Wednesday afternoon, and I was working my job. My + specific task on this day was quite simple: document our custom Vue + components that make up most of our products’ m4_abbr(UI). + } + + p {- + This should be a relatively easy task and for the most part it was, but + I had an issue. 
Some of these components had some @em{-really} obscure + properties that could influence their behavior, and seeing as much of + the codebase was written 10 years ago by utter idiots, the code + implementing these properties is @em{-really} hard to read. + } + + p {- + I decided that it would be quite a bit easier to instead of trying to + study the @em{-definitions} of these properties, to try to study the + @em{-usage} of these properties. But how do I find them? Our codebase + is hundreds of thousands of lines of code, and these properties have + very generic names such as ‘@em{-browser}’. Additionally, while the + components are easy to search for, they’re used in hundreds of places + and such properties may only be used once or twice. + } + + p {- + The solution? I thought it would be the trusty tool in every hacker’s + toolbelt: @code{-grep}. + } + + h2 #downfall {-The Downfall of Grep} + p {- + I thought that @code{-grep} would be my saviour. The tool that would + answer the call to find me the usages I so desired. So I whipped that + baby out and went straight to work: + } + + figure { + pre {= m4_fmt_code(grep.sh.gsp) } + } + + p {- + You can probably tell from the fact I’m writing this post that this did + not work. If you’ve ever worked with Vue or something similar, you + might even be able to figure out why. For those unfamiliar with the + frontend (you’re a treasure that must be preserved), allow me to show + you something that is all too common in a Vue codebase: + } + + figure { + pre .vue {= m4_fmt_code(example.vue.gsp) } + } + + p {- + The issue here is clear: the property we’re searching for (‘browser’) is + on an entirely different line from the component we’re searching for + (‘@code{-<date-input>}’). 
It’s not enough to search for just the + component because it’s used everywhere but only a few rare usages + interest me, and it’s not enough to search for just the attribute + because many different components have attributes of the same name (and + @em{-no}, they don’t have the same behavior; the codebase is shit). + } + + p {- + What I need is a tool that will let me search for patterns that span + multiple lines. + } + + h2 #grab {-Introducing Grab} + figure .quote { + blockquote { + p {= + The current UNIX® text processing tools are weakened by the built-in + concept of a line. There is a simple notation that can describe the + ‘shape’ of files when the typical array-of-lines picture is + inadequate. That notation is regular expressions. Using regular + expressions to describe the structure in addition to the contents of + files has interesting applications, and yields elegant methods for + dealing with some problems the current tools handle clumsily. When + operations using these expressions are composed, the result is + reminiscent of shell pipelines. + } + } + figcaption {-Rob Pike} + } + + p {- + That quote is from the abstract of @cite {-Structural Regular + Expressions}, a paper written by the one and only Rob Pike back in + 1987. It describes an idea by which we stop assuming that all data is + organized in lines, and instead use regular expressions to define the + shapes comprising our data. + } + + p {- + I actually had read this paper some years ago and it had always sat in + the back of my mind. I had actually toyed around in the past with an + implementation of @code{-grep} that wasn’t strictly line-oriented, but + it was very bare-bones, and lacked basic faculties such as reporting the + positions of matches, something I desperately needed. + } + + p {- + So over the following few days I made major changes, rewrote lots of the + code, and overall turned my tool — @code{-grab} — into a staple part of + my hacker’s toolbelt. 
+ } + + h2 #how {-How Grab Finds Text} + p {- + If you’re familiar with the UNIX environment, you’re probably used to + querying text with tools such as @code{-sed} and @code{-awk} using + regular expressions. These are the same regular expressions we as + programmers all know and love, but with one important — yet often + overlooked — characteristic: you cannot match the newline. + } + + p {- + The @code{-grab} utility moves away from this limiting paradigm; the + newline is treated no differently from any other character you want + to match. Want to match an entire paragraph of text? The pattern is as + simple as ‘@code{-[^\\n].‌+?(?=\\n\\n|$)}’. It may look + complicated if you’re new to regular expressions — m4_abbr(PCRE)s to be + specific — but it’s really quite simple. You just match a non-newline + character, and then as many characters as possible until reaching either + a double newline, or the end of input. + } + + p {- + On its own this isn’t too amazing though. The great thing about + @code{-grep} is that it doesn’t just show you matches, but it shows you + them in the context of a complete line. @code{-grab} solves this in the + same way described in Rob Pike’s paper: chaining operations. + } + + p {- + Say we want to iterate not over lines but over paragraphs. We can use + the following @em{-pattern}: + } + + figure { + pre {= m4_fmt_code(x.pat.gsp) } + } + + p {- + Here we’re using the ‘x’ operator. It iterates over all occurrences of + the pattern. In this case we’re iterating over all paragraphs in our + input. Maybe we want to see all paragraphs which contain doubled words + (for example: ‘the the’), a common typo found in text files. For this + we can use the ‘g’ operator: + } + + figure { + pre {= m4_fmt_code(g.pat.gsp) } + } + + p {- + The fundamental difference between the two operators is that the + ‘x’ operator specifies the structure to iterate over. 
In the context of + @code{-grep} these are lines, but in @code{-grab} they can be whatever + you want. The ‘g’ operator on the other hand doesn’t modify the + structure of the matches returned to you at all; it simply acts as a + filter selecting matches which match the given regular expression. + } + + p {- + Here’s an interactive example: + } + + figure { + pre {= m4_fmt_code(example-1.sh.gsp) } + } + + p {- + This is almost perfect; there’s just one bit missing. In my interactive + example I’ve shown how you can use the power of @code{-grab} to find + paragraphs in your files containing doubled words. This is really handy + if you find yourself writing websites, documentation, or other long-form + written content. + } + + p {- + Given my example though, how easily were you able to spot the doubled + words? It probably didn’t stick out to you right away, unlike if it had + been highlighted by some bright flashy color. It is for this reason + that the ‘h’ operator exists. This operator is unique in that it does + not change the given selections at all. Any matches made by previous + occurrences of ‘x’ and ‘g’ will be displayed the same with and without + the use of ‘h’. + } + + p {- + The ‘h’ operator is purely for the user. By using this operator you + can specify a pattern for which matching text must be @em{-highlighted}. + Let’s apply it to the previous example and see how the doubled words are + made instantly obvious to the user: + } + + figure { + pre {= m4_fmt_code(example-2.sh.gsp) } + } + + p {- + There is an obvious problem here: the duplication of the regular + expression provided to the ‘g’ and ‘h’ operators. It is @em{-extremely} + common that you will want to highlight text that was just matched by a + ‘g’ operator. Like, @em{-really} common. So common in fact that the + ‘h’ operator supports a shorthand syntax for this exact situation: + @code {-h//}. 
Giving an empty regular expression as an argument to an + operator is illegal with the exception of the ‘h’ operator. When this + operator is given an empty argument, it assumes the regular expression + of the previous operator: + } + + figure { + pre {= m4_fmt_code(example-3.sh.gsp) } + } + + h2 #final {-Final Solution} + p {- + So… what was the final solution to my problem? How did I find all the + @code{-<date-input>} tags in my job’s codebase that were passed the + ‘browser’ attribute? Well here’s how: + } + + figure { + pre {= m4_fmt_code(answer.sh.gsp) } + } + + p {- + Quick, simple, and elegant. Just the way I like it! + } + + h2 #more {-Additional Operators} + p {- + Here I’ve shown you the 3 main operators: ‘x’, ‘g’, and ‘h’. These are + not all, however! Each operator also has a capital variant (‘X’, ‘G’, + ‘H’) which behaves the same but instead of working on text that matches + the given pattern, these operators match on text which @em{-doesn’t} + match the given pattern. + } + + p {- + These operators allow for better pattern matching. For example a + pattern to match all numbers which contain a ‘3’ but which aren’t ‘1337’ + could be written as @code{-x/[0-9]+/ g/3/ G/^1337$/}. 
+ } + } + + hr{} + + footer { m4_footer } + } +} diff --git a/src/blog/grab/x.pat.gsp b/src/blog/grab/x.pat.gsp new file mode 100644 index 0000000..1ac0e34 --- /dev/null +++ b/src/blog/grab/x.pat.gsp @@ -0,0 +1 @@ +@span .fn {-x}@span .op {-/}[^\\n].‌+?(?=\\n\\n|$)@span .op {-/} diff --git a/src/blog/index.gsp b/src/blog/index.gsp index 0673da3..fce8d08 100644 --- a/src/blog/index.gsp +++ b/src/blog/index.gsp @@ -28,6 +28,7 @@ html lang="en" { p {-Posts:} ul { + m4_article(grab, {-Making Grep Better}) m4_article(andy-val, {-Values in Andy}) m4_article(new-sh, {-Making a New Shell}) m4_article(extend, {-Extensible Scripting}) diff --git a/src/style.css b/src/style.css index 19a32ab..3941394 100644 --- a/src/style.css +++ b/src/style.css @@ -50,8 +50,15 @@ pre, code, kbd, samp { pre > code { color: var(--fg); } :not(pre) > code { white-space: nowrap; } -pre.js, pre.sh { tab-size: 4; } -pre.pug, pre.gsp { tab-size: 2; } +pre.js, +pre.sh { + tab-size: 4; +} +pre.gsp, +pre.pug, +pre.vue { + tab-size: 2; +} h1 { font-size: 1.8rem; @@ -200,6 +207,11 @@ dl { .gsp-val { color: var(--aqua); } .gsp-op { color: var(--lesser); } +.grab-file { color: #ECABF7; } +.grab-hl { color: #FF4D51; font-weight: bold; } +.grab-pos { color: #47B413; } +.grab-sep { color: #24DFC4; } + .article { display: flex; justify-content: space-between; @@ -264,6 +276,7 @@ dl { abbr.led::before { content: 'Light-Emitting Diode'; } abbr.nas::before { content: 'Network Attached Storage'; } abbr.os::before { content: 'Operating System'; } + abbr.pcre::before { content: 'Perl Compatible Regular Expression'; } abbr.pdf::before { content: 'Portable Document Format'; } abbr.posix::before { content: 'Portable Operating System Interface'; } abbr.qr::before { content: 'Quick Response'; } -- cgit v1.2.3