dedupe blob file data
Ted Unangst tedu@tedunangst.com
Wed, 30 Sep 2020 15:20:40 -0400
4 files changed,
80 insertions(+),
22 deletions(-)
M
database.go
→
database.go
@@ -17,6 +17,7 @@ package main
import ( "bytes" + "crypto/sha512" "database/sql" "encoding/json" "fmt"@@ -509,20 +510,41 @@ fileid, _, err := savefileandxid(name, desc, url, media, local, data)
return fileid, err } +func hashfiledata(data []byte) string { + h := sha512.New512_256() + h.Write(data) + return fmt.Sprintf("%x", h.Sum(nil)) +} + func savefileandxid(name string, desc string, url string, media string, local bool, data []byte) (int64, string, error) { - xid := xfiltrate() - switch media { - case "image/png": - xid += ".png" - case "image/jpeg": - xid += ".jpg" - case "application/pdf": - xid += ".pdf" - case "text/plain": - xid += ".txt" - } - if url == "" { - url = fmt.Sprintf("https://%s/d/%s", serverName, xid) + var xid string + if local { + hash := hashfiledata(data) + row := stmtCheckFileData.QueryRow(hash) + err := row.Scan(&xid) + if err == sql.ErrNoRows { + xid = xfiltrate() + switch media { + case "image/png": + xid += ".png" + case "image/jpeg": + xid += ".jpg" + case "application/pdf": + xid += ".pdf" + case "text/plain": + xid += ".txt" + } + _, err = stmtSaveFileData.Exec(xid, media, hash, data) + if err != nil { + return 0, "", err + } + } else if err != nil { + log.Printf("error checking file hash: %s", err) + return 0, "", err + } + if url == "" { + url = fmt.Sprintf("https://%s/d/%s", serverName, xid) + } } res, err := stmtSaveFile.Exec(xid, name, desc, url, media, local)@@ -530,12 +552,6 @@ if err != nil {
return 0, "", err } fileid, _ := res.LastInsertId() - if local { - _, err = stmtSaveFileData.Exec(xid, media, data) - if err != nil { - return 0, "", err - } - } return fileid, xid, nil }@@ -890,6 +906,7 @@ var stmtHonksFromLongAgo *sql.Stmt
var stmtHonksByHonker, stmtSaveHonk, stmtUserByName, stmtUserByNumber *sql.Stmt var stmtEventHonks, stmtOneBonk, stmtFindZonk, stmtFindXonk, stmtSaveDonk *sql.Stmt var stmtFindFile, stmtGetFileData, stmtSaveFileData, stmtSaveFile *sql.Stmt +var stmtCheckFileData *sql.Stmt var stmtAddDoover, stmtGetDoovers, stmtLoadDoover, stmtZapDoover, stmtOneHonker *sql.Stmt var stmtUntagged, stmtDeleteHonk, stmtDeleteDonks, stmtDeleteOnts, stmtSaveZonker *sql.Stmt var stmtGetZonkers, stmtRecentHonkers, stmtGetXonker, stmtSaveXonker, stmtDeleteXonker *sql.Stmt@@ -951,7 +968,8 @@ stmtSaveDonk = preparetodie(db, "insert into donks (honkid, chonkid, fileid) values (?, ?, ?)")
stmtDeleteDonks = preparetodie(db, "delete from donks where honkid = ?") stmtSaveFile = preparetodie(db, "insert into filemeta (xid, name, description, url, media, local) values (?, ?, ?, ?, ?, ?)") blobdb := openblobdb() - stmtSaveFileData = preparetodie(blobdb, "insert into filedata (xid, media, content) values (?, ?, ?)") + stmtSaveFileData = preparetodie(blobdb, "insert into filedata (xid, media, hash, content) values (?, ?, ?, ?)") + stmtCheckFileData = preparetodie(blobdb, "select xid from filedata where hash = ?") stmtGetFileData = preparetodie(blobdb, "select media, content from filedata where xid = ?") stmtFindXonk = preparetodie(db, "select honkid from honks where userid = ? and xid = ?") stmtFindFile = preparetodie(db, "select fileid, xid from filemeta where url = ? and local = 1")
M
docs/changelog.txt
→
docs/changelog.txt
@@ -2,6 +2,8 @@ changelog
=== next ++ Dedupe blob file data. + - Custom lingo for those who don't like honking. + Better support for rich text bios.
M
upgradedb.go
→
upgradedb.go
@@ -23,7 +23,7 @@ "strings"
"time" ) -var myVersion = 39 +var myVersion = 40 type dbexecer interface { Exec(query string, args ...interface{}) (sql.Result, error)@@ -168,6 +168,39 @@ doordie(db, "update honkers set folxid = abs(random())")
doordie(db, "update config set value = 39 where key = 'dbversion'") fallthrough case 39: + blobdb := openblobdb() + doordie(blobdb, "alter table filedata add column hash text") + doordie(blobdb, "create index idx_filehash on filedata(hash)") + rows, err := blobdb.Query("select xid, content from filedata") + if err != nil { + log.Fatal(err) + } + m := make(map[string]string) + for rows.Next() { + var xid string + var data sql.RawBytes + err := rows.Scan(&xid, &data) + if err != nil { + log.Fatal(err) + } + hash := hashfiledata(data) + m[xid] = hash + } + rows.Close() + tx, err := blobdb.Begin() + if err != nil { + log.Fatal(err) + } + for xid, hash := range m { + doordie(tx, "update filedata set hash = ? where xid = ?", hash, xid) + } + err = tx.Commit() + if err != nil { + log.Fatal(err) + } + doordie(db, "update config set value = 40 where key = 'dbversion'") + fallthrough + case 40: default: log.Fatalf("can't upgrade unknown version %d", dbversion)
M
util.go
→
util.go
@@ -172,12 +172,17 @@ if err != nil {
log.Print(err) return } - _, err = blobdb.Exec("create table filedata (xid text, media text, content blob)") + _, err = blobdb.Exec("create table filedata (xid text, media text, hash text, content blob)") if err != nil { log.Print(err) return } _, err = blobdb.Exec("create index idx_filexid on filedata(xid)") + if err != nil { + log.Print(err) + return + } + _, err = blobdb.Exec("create index idx_filehash on filedata(hash)") if err != nil { log.Print(err) return