Librarian pruning

Package diffs SELECT purpose, COUNT(pdid), SUM(filesize) FROM ( SELECT archive.purpose, packagediff.id AS pdid, libraryfilecontent.filesize, COUNT(sprf.id) FROM packagediff JOIN sourcepackagerelease ON packagediff.to_source = sourcepackagerelease.id JOIN archive ON sourcepackagerelease.upload_archive = archive.id JOIN libraryfilealias ON packagediff.diff_content = libraryfilealias.id JOIN libraryfilecontent ON libraryfilealias.content = libraryfilecontent.id LEFT JOIN sourcepackagereleasefile AS sprf ON sprf.sourcepackagerelease = sourcepackagerelease.id LEFT JOIN libraryfilealias AS lfa2 ON lfa2.id = sprf.libraryfile WHERE lfa2.expires IS NOT NULL GROUP BY archive.purpose, packagediff.id, libraryfilecontent.filesize HAVING COUNT(sprf.id) > 0) AS whatevs GROUP BY purpose; All SPRs purpose | count | sum ---------+--------+-------------- 1 | 122904 | 52406336082 2 | 299916 | 166301964589 4 | 142 | 1038624543 Expired SPRs purpose | count | sum ---------+--------+-------------- 2 | 228250 | 124930677164 4 | 88 | 696706807 We can recover ~125GB. Debian sources select sum(filesize) from ( select distinct on (libraryfilecontent.id) libraryfilecontent.id, libraryfilecontent.filesize from libraryfilealias, libraryfilecontent, sourcepackagereleasefile, sourcepackagepublishinghistory where libraryfilecontent.id = libraryfilealias.content AND sourcepackagepublishinghistory.archive = 3 AND sourcepackagereleasefile.sourcepackagerelease = sourcepackagepublishinghistory.sourcepackagerelease AND sourcepackagereleasefile.libraryfile = libraryfilealias.id AND libraryfilealias.expires is NULL ) as DONOTCARE; sum -------------- 163160261108 We can recover well under 163GB. Most of it is probably in LFCs shared with the Ubuntu primary archive, too. Custom uploads SELECT distroseries.name, customformat, SUM(filesize) FROM packageupload JOIN packageuploadcustom ON packageuploadcustom.packageupload = packageupload.id JOIN libraryfilealias ON libraryfilealias.id = packageuploadcustom.libraryfilealias JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content JOIN distroseries ON distroseries.id = packageupload.distroseries GROUP BY distroseries.name, customformat; name | customformat | sum ----------+--------------+------------- dapper | 0 | 7953438662 | 2 | 1855126 edgy | 0 | 3989003467 | 2 | 7539548 | 3 | 19527463 feisty | 0 | 6936774302 | 2 | 15943395 | 3 | 12111689 gutsy | 0 | 6026825659 | 2 | 13519031 | 3 | 27018915 hardy | 0 | 11659962007 | 1 | 1061623619 | 2 | 29660708 | 3 | 42243159 intrepid | 0 | 7790748428 | 2 | 36114785 | 3 | 143574861 jaunty | 0 | 20580426775 | 1 | 416457777 | 2 | 25339400 | 3 | 201651494 karmic | 0 | 18922297797 | 1 | 1274878005 | 2 | 35827640 | 3 | 259162691 | 4 | 1607553540 lucid | 0 | 15693443523 | 1 | 5143661280 | 2 | 52683490 | 3 | 284948890 | 4 | 2683724680 | 5 | 13 maverick | 0 | 11620892644 | 1 | 28204933972 | 2 | 34029988 | 3 | 124235959 | 4 | 2868157415 | 5 | 140796 natty | 1 | 167744190 ~112GB of d-i tarballs. Most of those are probably cullable. All the other types combined are <50GB. Binaries By archive purpose SELECT purpose, SUM(filesize) FROM ( SELECT DISTINCT on (libraryfilecontent.id) archive.purpose, libraryfilecontent.id, libraryfilecontent.filesize FROM libraryfilealias JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease JOIN binarypackagebuild ON binarypackagebuild.id = binarypackagerelease.build JOIN packagebuild ON packagebuild.id = binarypackagebuild.package_build JOIN archive ON archive.id = packagebuild.archive WHERE libraryfilealias.expires is NULL ) as DONOTCARE GROUP BY purpose; purpose | sum ---------+--------------- 4 | 6531672894 1 | 2258952516680 6 | 18479879302 2 | 1179581775510 Only 1TB of live binaries were built in PPAs. And some of that will be in the primary archive now. So we probably want to kill bits of the primary archive. By series, in the primary archive SELECT distroseries.name, SUM(filesize) FROM ( SELECT DISTINCT on (libraryfilecontent.id) distroarchseries.distroseries AS dsid, libraryfilecontent.id, libraryfilecontent.filesize FROM libraryfilealias JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease JOIN binarypackagepublishinghistory ON binarypackagepublishinghistory.binarypackagerelease = binarypackagerelease.id JOIN distroarchseries ON distroarchseries.id = binarypackagepublishinghistory.distroarchseries WHERE libraryfilealias.expires is NULL AND binarypackagepublishinghistory.archive = 1 ) as DONOTCARE JOIN distroseries ON distroseries.id = dsid GROUP BY distroseries.name, distroseries.id ORDER BY distroseries.id; name | sum ----------+-------------- hoary | 27733894316 warty | 14336672506 breezy | 29942826488 dapper | 195109840998 edgy | 24260472130 feisty | 36122734102 gutsy | 46239699592 hardy | 387068256000 intrepid | 84222652064 jaunty | 409992452454 karmic | 439190753940 lucid | 385088521148 maverick | 312278397004 natty | 31982407516 Not entirely reliable -- the DISTINCT is on LFC, so who knows which distroseries things will show up in. Fixing that will make it duplicatastic between series, but: SELECT distroseries.name, SUM(filesize) FROM ( SELECT DISTINCT distroarchseries.distroseries AS dsid, libraryfilecontent.id, libraryfilecontent.filesize FROM libraryfilealias JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease JOIN binarypackagepublishinghistory ON binarypackagepublishinghistory.binarypackagerelease = binarypackagerelease.id JOIN distroarchseries ON distroarchseries.id = binarypackagepublishinghistory.distroarchseries WHERE libraryfilealias.expires is NULL AND binarypackagepublishinghistory.archive = 1 ) as DONOTCARE JOIN distroseries ON distroseries.id = dsid GROUP BY distroseries.name, distroseries.id ORDER BY distroseries.id; name | sum ----------+-------------- hoary | 31867969066 warty | 16728308976 breezy | 37643317154 dapper | 206771360446 edgy | 53007114660 feisty | 63610979098 gutsy | 91420134964 hardy | 458761150888 intrepid | 163692052384 jaunty | 490078125746 karmic | 525245050336 lucid | 472053820314 maverick | 390624462548 natty | 76826906594 Comparing to the published binaries: SELECT distroseries.name, SUM(filesize) FROM ( SELECT DISTINCT distroarchseries.distroseries AS dsid, libraryfilecontent.id, libraryfilecontent.filesize FROM libraryfilealias JOIN libraryfilecontent ON libraryfilecontent.id = libraryfilealias.content JOIN binarypackagefile ON binarypackagefile.libraryfile = libraryfilealias.id JOIN binarypackagerelease ON binarypackagerelease.id = binarypackagefile.binarypackagerelease JOIN binarypackagepublishinghistory ON binarypackagepublishinghistory.binarypackagerelease = binarypackagerelease.id JOIN distroarchseries ON distroarchseries.id = binarypackagepublishinghistory.distroarchseries WHERE libraryfilealias.expires is NULL AND binarypackagepublishinghistory.archive = 1 AND binarypackagepublishinghistory.status IN (1, 2, 5) ) as DONOTCARE JOIN distroseries ON distroseries.id = dsid GROUP BY distroseries.name, distroseries.id ORDER BY distroseries.id; name | sum ----------+-------------- hoary | 31867969066 warty | 16728308976 breezy | 37643317154 dapper | 29759999894 edgy | 53007114660 feisty | 31042965794 gutsy | 85342535414 hardy | 110169230692 intrepid | 104598712170 jaunty | 121281205238 karmic | 120171582232 lucid | 113294111936 maverick | 76158710440 natty | 74792133316